Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Latest commit

 

History

History
History
82 lines (71 loc) · 2.42 KB

File metadata and controls

82 lines (71 loc) · 2.42 KB
Copy raw file
Download raw file
Open symbols panel
Edit and raw actions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
# encoding=utf-8
import requests
import re
import codecs
from bs4 import BeautifulSoup
from openpyxl import Workbook
wb = Workbook()
dest_filename = '电影.xlsx'
ws1 = wb.active
ws1.title = "电影top250"
DOWNLOAD_URL = 'http://movie.douban.com/top250/'
def download_page(url):
"""获取url地址页面内容"""
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
}
data = requests.get(url, headers=headers).content
return data
def get_li(doc):
soup = BeautifulSoup(doc, 'html.parser')
ol = soup.find('ol', class_='grid_view')
name = [] # 名字
star_con = [] # 评价人数
score = [] # 评分
info_list = [] # 短评
for i in ol.find_all('li'):
detail = i.find('div', attrs={'class': 'hd'})
movie_name = detail.find(
'span', attrs={'class': 'title'}).get_text() # 电影名字
level_star = i.find(
'span', attrs={'class': 'rating_num'}).get_text() # 评分
star = i.find('div', attrs={'class': 'star'})
star_num = star.find(text=re.compile('评价')) # 评价
info = i.find('span', attrs={'class': 'inq'}) # 短评
if info: # 判断是否有短评
info_list.append(info.get_text())
else:
info_list.append('无')
score.append(level_star)
name.append(movie_name)
star_con.append(star_num)
page = soup.find('span', attrs={'class': 'next'}).find('a') # 获取下一页
if page:
return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
return name, star_con, score, info_list, None
def main():
url = DOWNLOAD_URL
name = []
star_con = []
score = []
info = []
while url:
doc = download_page(url)
movie, star, level_num, info_list, url = get_li(doc)
name = name + movie
star_con = star_con + star
score = score + level_num
info = info + info_list
for (i, m, o, p) in zip(name, star_con, score, info):
col_A = 'A%s' % (name.index(i) + 1)
col_B = 'B%s' % (name.index(i) + 1)
col_C = 'C%s' % (name.index(i) + 1)
col_D = 'D%s' % (name.index(i) + 1)
ws1[col_A] = i
ws1[col_B] = m
ws1[col_C] = o
ws1[col_D] = p
wb.save(filename=dest_filename)
if __name__ == '__main__':
main()
Morty Proxy This is a proxified and sanitized view of the page, visit original site.