forked from gaopu/Java
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCrawlUtils.java
More file actions
124 lines (109 loc) · 4.01 KB
/
CrawlUtils.java
File metadata and controls
124 lines (109 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
package com.crawl.comments;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.dom4j.Element;
import org.dom4j.io.XMLWriter;
import org.json.JSONObject;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.HashSet;
import java.util.Set;
/**
 * Static helpers used by the comment crawler: fetching an app's name and
 * comment count over HTTP, listing the app ids to crawl, and writing the
 * collected XML to disk.
 *
 * Created by geekgao on 15-10-25.
 */
public class CrawlUtils {

    /** Detail page of a single app; the app id is appended to this prefix. */
    private static final String APP_DETAIL_URL = "http://zhushou.360.cn/detail/index/soft_id/";

    /** Comment endpoint, split around the position where the app id is inserted. */
    private static final String COMMENTS_URL_PREFIX =
            "http://comment.mobilem.360.cn/comment/getComments?baike=";
    private static final String COMMENTS_URL_SUFFIX =
            "&level=0&start=0&count=1&fm=home_jingjia_3&m=c1804fc5ca4ded8293acd1151efaf3db&m2=61f3c1e4d105b55aff323b20a8136c4e&v=3.2.50&re=1&nt=1&ch=493041&os=21&model=MX4+Pro&sn=4.66476154040931&cu=m76&ca1=armeabi-v7a&ca2=armeabi&ppi=1536x2560&cpc=1&startCount=4";

    private static final String TITLE_OPEN_TAG = "<title>";

    /**
     * Fetches the detail page of an app and returns the text of its
     * {@code <title>} element.
     *
     * @param id app id
     * @return the page title (everything between {@code <title>} and the next
     *         {@code <}), or {@code null} if the host cannot be resolved or the
     *         page contains no {@code <title>} tag
     * @throws IOException if the request fails for any other I/O reason
     */
    public static String getAppName(String id) throws IOException {
        String html;
        try {
            html = fetchBody(APP_DETAIL_URL + id);
        } catch (java.net.UnknownHostException e) {
            // Unresolvable host is reported to the caller as "no name" rather than an error.
            return null;
        }
        if (html == null) {
            return null;
        }

        int titleStart = html.indexOf(TITLE_OPEN_TAG);
        if (titleStart < 0) {
            // No title tag at all: previously this raised ArrayIndexOutOfBoundsException.
            return null;
        }
        titleStart += TITLE_OPEN_TAG.length();
        int titleEnd = html.indexOf('<', titleStart);
        return titleEnd < 0 ? html.substring(titleStart) : html.substring(titleStart, titleEnd);
    }

    /**
     * Writes an XML element to a file, replacing any existing content.
     *
     * @param xml      the XML element to write
     * @param fileName path of the file to write to
     * @throws IOException if the file cannot be opened or written
     */
    public static void writeXmlToFile(Element xml,String fileName) throws IOException {
        // Only an element is written, so the output carries no encoding declaration and
        // XML parsers will read it as UTF-8; encode as UTF-8 explicitly instead of
        // relying on the platform default charset.
        Writer fileWriter = new java.io.OutputStreamWriter(
                new java.io.FileOutputStream(fileName), "UTF-8");
        try {
            XMLWriter xmlWriter = new XMLWriter(fileWriter);
            xmlWriter.write(xml);
            // Push buffered output to the file before the underlying writer is closed.
            xmlWriter.flush();
        } finally {
            // Close the file even when writing fails.
            fileWriter.close();
        }
    }

    /**
     * Returns the ids of the apps whose comments should be downloaded.
     *
     * <p>NOTE: both parameters are currently ignored and no request is made;
     * the method returns a fixed set containing the single id
     * {@code "1936882"}.
     *
     * @param uri   app category page (currently unused)
     * @param limit number of apps to collect ids for (currently unused)
     * @return a new mutable set holding the hard-coded app id
     */
    public static Set<String> getAppIds(String uri,int limit) throws IOException {
        // TODO: restore page scraping. The disabled implementation downloaded `uri`,
        // matched the regex  /detail/index/soft_id/(.*?)"  against the page source and
        // added up to 2 * limit matches to the set (its note says every id is matched
        // twice in the page source), returning null when nothing matched.
        //
        // Ids that were used during earlier runs and are currently disabled:
        //   3581, 778702, 1586, 6276, 122437, 5632, 4107, 98008, 3100672, 2345172,
        //   1343, 3094256, 101594, 1840672, 1643, 893686, 3032510, 7256, 727030
        Set<String> appIds = new HashSet<String>();
        appIds.add("1936882");
        return appIds;
    }

    /**
     * Queries the comment endpoint for an app and returns the value of
     * {@code data.total} from the JSON response.
     *
     * @param appId app id, sent as the {@code baike} query parameter
     * @return the {@code total} field of the response's {@code data} object
     * @throws IOException if the request fails
     */
    public static int getCommentCount(int appId) throws IOException {
        String json = fetchBody(COMMENTS_URL_PREFIX + appId + COMMENTS_URL_SUFFIX);
        return new JSONObject(json).getJSONObject("data").getInt("total");
    }

    /**
     * Performs an HTTP GET and returns the response body as a string, closing
     * both the response and the client afterwards so no connection is leaked.
     */
    private static String fetchBody(String url) throws IOException {
        CloseableHttpClient client = HttpClients.createDefault();
        try {
            CloseableHttpResponse response = client.execute(new HttpGet(url));
            try {
                return EntityUtils.toString(response.getEntity());
            } finally {
                response.close();
            }
        } finally {
            client.close();
        }
    }
}