Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Latest commit

 

History

History
History
96 lines (85 loc) · 4.02 KB

File metadata and controls

96 lines (85 loc) · 4.02 KB
Copy raw file
Download raw file
Open symbols panel
Edit and raw actions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.dom4j.Element;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Worker thread that downloads a single book-detail page, extracts the
 * author, publisher, ISBN and cover-image address from the page source with
 * the supplied regular expressions, and appends a {@code <book>} element
 * describing the book to a shared XML root element.
 *
 * <p>Several instances are expected to run concurrently against the same
 * root element, so the XML update (including the {@code Main.bookId}
 * increment) is serialized on {@code rootElement}.
 */
public class GetBookInfoThread extends Thread{
    private final CloseableHttpClient httpClient;
    private final String webAddress;
    private final Element rootElement;
    private final Pattern bookAuthorRegex;
    private final Pattern bookPublishRegex;
    private final Pattern bookIsbnRegex;
    private final Pattern bookImgRegex;
    private final String bookName;

    /**
     * Creates a worker for one book-detail page.
     *
     * @param httpClient       client used to perform the HTTP request
     * @param webAddress       address of the book-detail page to fetch
     * @param bookName         name of the book, written as-is into the XML
     * @param rootElement      root element of the XML document; a new
     *                         {@code book} child is appended to it
     * @param bookAuthorRegex  pattern whose group 1 captures the author
     * @param bookPublishRegex pattern whose group 1 captures the publisher
     * @param bookIsbnRegex    pattern whose group 1 captures the ISBN
     * @param bookImgRegex     pattern whose group 1 captures the image address
     */
    public GetBookInfoThread(CloseableHttpClient httpClient,String webAddress,String bookName,Element rootElement,Pattern bookAuthorRegex,Pattern bookPublishRegex,Pattern bookIsbnRegex,Pattern bookImgRegex) {
        this.httpClient = httpClient;
        this.webAddress = webAddress;
        this.rootElement = rootElement;
        this.bookAuthorRegex = bookAuthorRegex;
        this.bookPublishRegex = bookPublishRegex;
        this.bookIsbnRegex = bookIsbnRegex;
        this.bookName = bookName;
        this.bookImgRegex = bookImgRegex;
    }

    /**
     * Fetches the page, extracts the book fields and appends the resulting
     * {@code book} element to the root element. If the page cannot be
     * fetched (I/O error, non-200 status, or no response body) nothing is
     * added to the document.
     */
    @Override
    public void run() {
        String pageSource = fetchPageSource();
        if (pageSource == null) {
            // The failure has already been reported; without page source
            // there is nothing to match against.
            return;
        }

        String bookAuthor = firstGroup(bookAuthorRegex, pageSource);
        String bookPublish = firstGroup(bookPublishRegex, pageSource);
        String bookIsbn = firstGroup(bookIsbnRegex, pageSource);
        String bookImg = firstGroup(bookImgRegex, pageSource);

        // Every worker mutates the same element tree and the same
        // Main.bookId counter, so do the whole update under one lock.
        synchronized (rootElement) {
            Element bookElement = rootElement.addElement("book");
            bookElement.addAttribute("id",String.valueOf(Main.bookId++));
            bookElement.addElement("name").setText(bookName);
            bookElement.addElement("author").setText(bookAuthor);
            bookElement.addElement("publish").setText(bookPublish);
            bookElement.addElement("isbn").setText(bookIsbn);
            // Stock count is a random placeholder in the range 3..12.
            bookElement.addElement("count").setText(String.valueOf((int)(Math.random() * 10) + 3));
            bookElement.addElement("link").setText(webAddress);
            bookElement.addElement("img").setText(bookImg);
        }
        System.out.println("抓取了:" + webAddress + " " + bookName);
    }

    /**
     * Downloads the page at {@code webAddress} and returns its source.
     *
     * @return the page source, or {@code null} when the request failed, the
     *         status code was not 200, or the response carried no body
     */
    private String fetchPageSource() {
        HttpGet getBookInfo = new HttpGet(webAddress);
        getBookInfo.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30");
        try {
            CloseableHttpResponse bookInfoResponse = httpClient.execute(getBookInfo);
            try {
                if (bookInfoResponse.getStatusLine().getStatusCode() != 200) {
                    System.out.println("获取书本具体信息时出错,页面地址:" + webAddress + "错误信息" + bookInfoResponse.getStatusLine());
                    return null;
                }
                if (bookInfoResponse.getEntity() == null) {
                    return null;
                }
                return EntityUtils.toString(bookInfoResponse.getEntity());
            } finally {
                // Always release the response so the connection is not leaked.
                bookInfoResponse.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns capture group 1 of the first match of {@code regex} in
     * {@code text}, or the empty string when there is no match or the group
     * did not participate in the match.
     */
    private static String firstGroup(Pattern regex, String text) {
        Matcher matcher = regex.matcher(text);
        if (!matcher.find()) {
            return "";
        }
        String captured = matcher.group(1);
        return captured == null ? "" : captured;
    }
}
Morty Proxy This is a proxified and sanitized view of the page, visit original site.