-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler_huaban.py
More file actions
85 lines (73 loc) · 2.66 KB
/
crawler_huaban.py
File metadata and controls
85 lines (73 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
# coding: utf-8
### crawler_huaban.py
"""
Crawler that scrapes images from huaban.com (the "beauty" board).
Adapted from https://github.com/nan86150/Crawler-Demo/blob/master/huaban/HuabanCrawler.py
"""
import json
import os
import os.path
import re

import requests
class HuabanCrawler():
    """Scrape image links from the huaban.com "beauty" board and download them.

    Collected entries are dicts with keys ``id`` (pin id as str), ``url``
    (CDN image URL) and ``type`` (file extension, e.g. ``jpeg``), stored in
    ``self.images``.  Downloads go into an ``./images`` directory.
    """

    def __init__(self):
        """Prepare state and create the ./images output directory if missing."""
        self.homeUrl = "http://huaban.com/favorite/beauty/"
        self.images = []  # accumulated pin info dicts; see class docstring
        if not os.path.exists('./images'):
            os.mkdir('./images')

    def __load_homePage(self):
        """Fetch the board landing page and return its HTML as text.

        Uses ``.text`` (decoded str), not ``.content`` (bytes), so that the
        str regex in __process_data can search it under Python 3.
        """
        return requests.get(url=self.homeUrl).text

    def __make_ajax_url(self, No):
        """Return the AJAX pagination URL for pins older than pin id *No*."""
        return self.homeUrl + "?i5p998kw&max=" + No + "&limit=20&wfl=1"

    def __load_more(self, maxNo):
        """Fetch the next page of pins (older than pin id *maxNo*) as text."""
        return requests.get(url=self.__make_ajax_url(maxNo)).text

    def __process_data(self, htmlPage):
        """Extract pin info from *htmlPage* and append it to self.images.

        Returns None (early) when the page contains no pin data.
        """
        prog = re.compile(r'app\.page\["pins"\].*')
        appPins = prog.findall(htmlPage)
        if not appPins:
            return None
        # Strip the 19-char 'app.page["pins"] = ' prefix and trailing ';',
        # then parse with json.loads.  The original used eval() here, which
        # executes arbitrary code from a network response — a serious
        # security hole; the payload is plain JSON, so json.loads suffices.
        result = json.loads(appPins[0][19:-1])
        for pin in result:
            info = {}
            info['id'] = str(pin['pin_id'])
            info['url'] = "http://img.hb.aicdn.com/" + pin["file"]["key"] + "_fw658"
            mime = pin["file"]["type"]
            # 'image/jpeg' -> extension 'jpeg'; non-image types get a placeholder.
            if 'image' == mime[:5]:
                info['type'] = mime[6:]
            else:
                info['type'] = 'NoName'
            self.images.append(info)

    def __save_image(self, imageName, content):
        """Write raw image bytes *content* to the file path *imageName*."""
        with open(imageName, 'wb') as fp:
            fp.write(content)

    def get_image_info(self, num=20):
        """Collect info for roughly *num* images (fetched in pages of 20).

        Returns self.images for convenience.
        """
        self.__process_data(self.__load_homePage())
        # Floor division: Python 3's plain / yields a float, which range()
        # rejects; the original Py2 code relied on integer division.
        for _ in range((num - 1) // 20):
            # Paginate using the oldest pin id seen so far.
            self.__process_data(self.__load_more(self.images[-1]['id']))
        return self.images

    def down_images(self):
        """Download every image listed in self.images into ./images."""
        print("{} image will be download".format(len(self.images)))
        for key, image in enumerate(self.images):
            print('download {0} ...'.format(key))
            try:
                req = requests.get(image["url"])
            except requests.RequestException:
                # Skip this image: the original bare except fell through and
                # then used the unbound `req`, crashing with NameError.
                print('error')
                continue
            imageName = os.path.join("./images", image["id"] + "." + image["type"])
            self.__save_image(imageName, req.content)
if __name__ == '__main__':
    # Collect info for ~200 pins from the board, then download them all.
    crawler = HuabanCrawler()
    crawler.get_image_info(200)
    crawler.down_images()