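# Scrape book titles from Douban's Top 250 books list
# (https://book.douban.com/top250) and write them into an Excel file.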
from bs4 import BeautifulSoup
import requests
from openpyxl import Workbook
from urllib.parse import urljoin

excel_name = "书籍.xlsx"  # "书籍" = "Books"
wb = Workbook()
ws1 = wb.active
ws1.title = '书籍'
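

# Fetch a page's raw HTML, sending a desktop Firefox User-Agent header.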
def get_html(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}
    html = requests.get(url, headers=header).content
    return html
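

# Parse one listing page: collect every book title on it and return the
# href of the "next page" link (None when there is no further page).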
def get_con(html):
    soup = BeautifulSoup(html, 'html.parser')
    book_list = soup.find('div', attrs={'class': 'article'})
    page = soup.find('div', attrs={'class': 'paginator'})
    next_page = page.find('span', attrs={'class': 'next'}).find('a')
    name = []
    for i in book_list.find_all('table'):
        book_name = i.find('div', attrs={'class': 'pl2'})
        # A title may be split across several strings (main title plus
        # subtitle), so join the first two stripped strings when present.
        m = list(book_name.find('a').stripped_strings)
        if len(m) > 1:
            x = m[0] + m[1]
        else:
            x = m[0]
        name.append(x)
    if next_page:
        return name, next_page.get('href')
    else:
        return name, None
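

# Walk every page of the Top 250 list, then write the collected titles
# into column A of the worksheet and save the workbook.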
def main():
    base = 'https://book.douban.com/top250'
    url = base
    name_list = []
    while url:
        html = get_html(url)
        name, next_href = get_con(html)
        name_list.extend(name)
        # The "next" href can be relative (e.g. "?start=25"), so resolve
        # it against the base URL before the next request.
        url = urljoin(base, next_href) if next_href else None
    # enumerate() avoids the original name_list.index(i) lookup, which
    # returns the first match and would overwrite the same cell whenever
    # two books share a title.
    for row, book in enumerate(name_list, start=1):
        location = 'A%s' % row
        print(book)
        print(location)
        ws1[location] = book
    wb.save(filename=excel_name)


if __name__ == '__main__':
    main()