java为什么我从html中获得unicode?
我编写了一个没有第三方库的解析器。从网站获取html代码-http://www.cnn.com/,但代码的某些部分使用unicode符号,例如:“\u003cbr/>;登录电视服务提供商以访问\u003cbr/>;”我认为这是编码的问题-我如何修复它?对不起我的英语。谢谢
public class Main {
public static void main(String[] args) throws IOException {
String commandLine = Scraper.readLineFromConsole();
Reader reader = Scraper.getReader(commandLine);
Scraper.writeInFileFromURL(reader);
}
public static class Scraper {
public static void writeInFileFromURL(Reader out) {
Reader reader = out;
BufferedReader br = new BufferedReader(reader);
try {
PrintWriter writer = new PrintWriter("newFile.txt");
String htmltext;
while (br.ready()) {
htmltext = br.readLine();
writer.write(new String(htmltext));
}
writer.flush();
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public static String readLineFromConsole() {
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
String commandLine = null;
try {
commandLine = reader.readLine();
} catch (IOException e) {
e.printStackTrace();
}
return commandLine;
}
public static Reader getReader(String url)
throws IOException {
// Retrieve from Internet.
if (url.startsWith("http:") || url.startsWith("https:")) {
URLConnection conn = new URL(url).openConnection();
return new InputStreamReader(conn.getInputStream());
}
// Retrieve from file.
else {
return new FileReader(url);
}
}
}
}
共 (0) 个答案