forked from gaopu/Java
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGetBookInfoThread.java
More file actions
96 lines (85 loc) · 4.02 KB
/
GetBookInfoThread.java
File metadata and controls
96 lines (85 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.dom4j.Element;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class GetBookInfoThread extends Thread{
private CloseableHttpClient httpClient;
private String webAddress;
private Element rootElement;
private Pattern bookAuthorRegex;
private Pattern bookPublishRegex;
private Pattern bookIsbnRegex;
private Pattern bookImgRegex;
private String bookName;
/**
*
* @param httpClient 用这个操作抓取
* @param webAddress 这个是抓取的网址
* @param rootElement 这个是一个xml文档的根节点,用这个来操作加入新的子节点
*/
public GetBookInfoThread(CloseableHttpClient httpClient,String webAddress,String bookName,Element rootElement,Pattern bookAuthorRegex,Pattern bookPublishRegex,Pattern bookIsbnRegex,Pattern bookImgRegex) {
this.httpClient = httpClient;
this.webAddress = webAddress;
this.rootElement = rootElement;
this.bookAuthorRegex = bookAuthorRegex;
this.bookPublishRegex = bookPublishRegex;
this.bookIsbnRegex = bookIsbnRegex;
this.bookName = bookName;
this.bookImgRegex = bookImgRegex;
}
@Override
public void run() {
HttpGet getBookInfo = new HttpGet(webAddress);
getBookInfo.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30");
CloseableHttpResponse bookInfoResponse;
String bookInfoCode = null;//书籍具体信息网页源码
try {
bookInfoResponse = httpClient.execute(getBookInfo);
if (bookInfoResponse.getStatusLine().getStatusCode() != 200) {
System.out.println("获取书本具体信息时出错,页面地址:" + webAddress + "错误信息" + bookInfoResponse.getStatusLine());
return;
}
bookInfoCode = EntityUtils.toString(bookInfoResponse.getEntity());
} catch (IOException e) {
e.printStackTrace();
}
Matcher bookAuthorMatcher = bookAuthorRegex.matcher(bookInfoCode); //匹配作者
Matcher bookPublishMatcher = bookPublishRegex.matcher(bookInfoCode); //匹配出版商
Matcher bookIsbnMatcher = bookIsbnRegex.matcher(bookInfoCode); //匹配isbn
Matcher bookImgMatcher = bookImgRegex.matcher(bookInfoCode); //匹配图片地址
String bookName = this.bookName;
String bookAuthor = "";
String bookPublish = "";
String bookIsbn = "";
String bookLink = webAddress;
String bookImg = "";
if (bookAuthorMatcher.find()) {
bookAuthor = bookAuthorMatcher.group(1);
}
if (bookPublishMatcher.find()) {
bookPublish = bookPublishMatcher.group(1);
}
if (bookIsbnMatcher.find()) {
bookIsbn = bookIsbnMatcher.group(1);
}
if (bookImgMatcher.find()) {
bookImg = bookImgMatcher.group(1);
}
// System.out.println(bookName + "-" + bookAuthor + "-" + bookPublish + "-" + bookIsbn);
Element bookElement = rootElement.addElement("book");//新建一个书的标签
bookElement.addAttribute("id",String.valueOf(Main.bookId++));
bookElement.addElement("name").setText(bookName);
bookElement.addElement("author").setText(bookAuthor);
bookElement.addElement("publish").setText(bookPublish);
bookElement.addElement("isbn").setText(bookIsbn);
bookElement.addElement("count").setText(String.valueOf((int)(Math.random() * 10) + 3));
bookElement.addElement("link").setText(bookLink);
bookElement.addElement("img").setText(bookImg);
System.out.println("抓取了:" + webAddress + " " + bookName);
}
}