Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Latest commit

 

History

History
History
116 lines (88 loc) · 3.78 KB

File metadata and controls

116 lines (88 loc) · 3.78 KB
Copy raw file
Download raw file
Open symbols panel
Edit and raw actions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import xlwt
browser = webdriver.PhantomJS()
WAIT = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('蔡徐坤篮球', cell_overwrite_ok=True)
sheet.write(0, 0, '名称')
sheet.write(0, 1, '地址')
sheet.write(0, 2, '描述')
sheet.write(0, 3, '观看次数')
sheet.write(0, 4, '弹幕数')
sheet.write(0, 5, '发布时间')
n = 1
def search():
try:
print('开始访问b站....')
browser.get("https://www.bilibili.com/")
# 被那个破登录遮住了
index = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#primary_menu > ul > li.home > a")))
index.click()
input = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#banner_link > div > div > form > input")))
submit = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="banner_link"]/div/div/form/button')))
input.send_keys('蔡徐坤 篮球')
submit.click()
# 跳转到新的窗口
print('跳转到新窗口')
all_h = browser.window_handles
browser.switch_to.window(all_h[1])
get_source()
total = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR,
"#server-search-app > div.contain > div.body-contain > div > div.page-wrap > div > ul > li.page-item.last > button")))
return int(total.text)
except TimeoutException:
return search()
def next_page(page_num):
try:
print('获取下一页数据')
next_btn = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR,
'#server-search-app > div.contain > div.body-contain > div > div.page-wrap > div > ul > li.page-item.next > button')))
next_btn.click()
WAIT.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,
'#server-search-app > div.contain > div.body-contain > div > div.page-wrap > div > ul > li.page-item.active > button'),
str(page_num)))
get_source()
except TimeoutException:
browser.refresh()
return next_page(page_num)
def save_to_excel(soup):
list = soup.find(class_='all-contain').find_all(class_='info')
for item in list:
item_title = item.find('a').get('title')
item_link = item.find('a').get('href')
item_dec = item.find(class_='des hide').text
item_view = item.find(class_='so-icon watch-num').text
item_biubiu = item.find(class_='so-icon hide').text
item_date = item.find(class_='so-icon time').text
print('爬取:' + item_title)
global n
sheet.write(n, 0, item_title)
sheet.write(n, 1, item_link)
sheet.write(n, 2, item_dec)
sheet.write(n, 3, item_view)
sheet.write(n, 4, item_biubiu)
sheet.write(n, 5, item_date)
n = n + 1
def get_source():
WAIT.until(EC.presence_of_element_located(
(By.CSS_SELECTOR, '#server-search-app > div.contain > div.body-contain > div > div.result-wrap.clearfix')))
html = browser.page_source
soup = BeautifulSoup(html, 'lxml')
save_to_excel(soup)
def main():
try:
total = search()
print(total)
for i in range(2, int(total + 1)):
next_page(i)
finally:
browser.close()
if __name__ == '__main__':
main()
book.save(u'蔡徐坤篮球.xlsx')
Morty Proxy This is a proxified and sanitized view of the page, visit original site.