Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Latest commit

 

History

History
History
85 lines (73 loc) · 2.66 KB

File metadata and controls

85 lines (73 loc) · 2.66 KB
Copy raw file
Download raw file
Open symbols panel
Edit and raw actions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
# coding: utf-8
### crawler_huaban.py
"""
爬虫,抓取花瓣网图片。
摘自https://github.com/nan86150/Crawler-Demo/blob/master/huaban/HuabanCrawler.py
"""
import json
import os
import os.path
import re

import requests
class HuabanCrawler():
    """Crawl image metadata and files from huaban.com's beauty favorites board."""

    def __init__(self):
        """Create the ./images output directory (if missing) to hold downloads."""
        self.homeUrl = "http://huaban.com/favorite/beauty/"
        self.images = []  # accumulated {'id', 'url', 'type'} dicts, one per pin
        if not os.path.exists('./images'):
            os.mkdir('./images')

    def __load_homePage(self):
        """Fetch the board home page and return its HTML.

        Uses .text rather than .content so the regex in __process_data
        matches str (on Python 3 .content is bytes and the str pattern fails).
        """
        return requests.get(url=self.homeUrl).text

    def __make_ajax_url(self, No):
        """Build the AJAX pagination URL; `No` is the pin id to page from (str)."""
        return self.homeUrl + "?i5p998kw&max=" + No + "&limit=20&wfl=1"

    def __load_more(self, maxNo):
        """Fetch one additional page of pins (20 per request) as HTML text."""
        return requests.get(url=self.__make_ajax_url(maxNo)).text

    def __process_data(self, htmlPage):
        """Extract pin info from an HTML page and append it to self.images.

        Returns None whether or not anything matched; results accumulate
        in self.images.
        """
        prog = re.compile(r'app\.page\["pins"\].*')
        appPins = prog.findall(htmlPage)
        if not appPins:
            return None
        # The match looks like: app.page["pins"] = [...];
        # Strip the 19-char 'app.page["pins"] = ' prefix and the trailing ';'
        # then parse the JSON array.  json.loads replaces the original eval(),
        # which executed untrusted remote data (and needed null/true shims,
        # while a literal `false` in the payload would have crashed it).
        result = json.loads(appPins[0][19:-1])
        for pin in result:
            info = {}
            info['id'] = str(pin['pin_id'])
            info['url'] = "http://img.hb.aicdn.com/" + pin["file"]["key"] + "_fw658"
            if 'image' == pin["file"]["type"][:5]:
                # MIME type like "image/jpeg" -> extension "jpeg"
                info['type'] = pin["file"]["type"][6:]
            else:
                info['type'] = 'NoName'
            self.images.append(info)

    def __save_image(self, imageName, content):
        """Write raw image bytes to the path imageName."""
        with open(imageName, 'wb') as fp:
            fp.write(content)

    def get_image_info(self, num=20):
        """Collect info for roughly `num` images (in pages of 20).

        Loads the home page first, then pages forward from the last seen
        pin id.  Returns the accumulated self.images list.
        """
        self.__process_data(self.__load_homePage())
        # Floor division: matches the original Py2 `/` semantics and is a
        # TypeError-free int on Py3 (range() rejects floats).
        for _ in range((num - 1) // 20):
            self.__process_data(self.__load_more(self.images[-1]['id']))
        return self.images

    def down_images(self):
        """Download every collected image into ./images."""
        print("{} image will be download".format(len(self.images)))
        for key, image in enumerate(self.images):
            print('download {0} ...'.format(key))
            try:
                req = requests.get(image["url"])
            except requests.RequestException:
                # The original bare `except:` printed 'error' but then fell
                # through and reused a stale/unbound `req`; skip instead.
                print('error')
                continue
            imageName = os.path.join("./images", image["id"] + "." + image["type"])
            self.__save_image(imageName, req.content)
if __name__ == '__main__':
    # Gather metadata for ~200 images, then download them all into ./images.
    crawler = HuabanCrawler()
    crawler.get_image_info(200)
    crawler.down_images()
Morty Proxy This is a proxified and sanitized view of the page, visit original site.