Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 0ff6019

Browse filesBrowse files
committed
update
1 parent e442c25 commit 0ff6019
Copy full SHA for 0ff6019

File tree

Expand file treeCollapse file tree

8 files changed

+33230
-571
lines changed
Open diff view settings
Filter options
Expand file treeCollapse file tree

8 files changed

+33230
-571
lines changed
Open diff view settings
Collapse file

‎Pipfile‎

Copy file name to clipboardExpand all lines: Pipfile
+5Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ verify_ssl = true
77

88
[packages]
99
requests = {extras = ["socks"],version = "*"}
10+
jieba = "*"
11+
numpy = "*"
12+
scipy = "*"
13+
imageio = "*"
14+
wordcloud = "*"
1015

1116
[requires]
1217
python_version = "3.7"
Collapse file

‎Pipfile.lock‎

Copy file name to clipboardExpand all lines: Pipfile.lock
+216-1Lines changed: 216 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Collapse file

‎stopwords.txt‎

Copy file name to clipboard
+113Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
the
2+
of
3+
is
4+
and
5+
to
6+
he
7+
she
8+
...
9+
just
10+
so
11+
my
12+
in
13+
..
14+
that
15+
we
16+
very
17+
me
18+
was
19+
but
20+
your
21+
how
22+
do
23+
ng
24+
for
25+
an
26+
are
27+
by
28+
be
29+
as
30+
on
31+
with
32+
can
33+
if
34+
from
35+
which
36+
you
37+
it
38+
this
39+
then
40+
at
41+
have
42+
all
43+
not
44+
one
45+
has
46+
or
47+
that
48+
49+
50+
51+
52+
53+
54+
55+
56+
57+
58+
59+
什么
60+
还是
61+
就是
62+
还要
63+
可以
64+
没有
65+
看看
66+
怎么
67+
那么
68+
不能
69+
分享
70+
出来
71+
已经
72+
下载
73+
有点
74+
今天
75+
很多
76+
因为
77+
你们
78+
完全
79+
一次
80+
quot
81+
不是
82+
这样
83+
这么
84+
觉得
85+
知道
86+
只有
87+
不过
88+
需要
89+
还有
90+
一个
91+
这个
92+
回复
93+
现在
94+
不错
95+
大家
96+
应该
97+
我刚
98+
不会
99+
如果
100+
时候
101+
开始
102+
正在
103+
为啥
104+
各种
105+
一個
106+
沒有
107+
我們
108+
你們
109+
妳們
110+
他們
111+
她們
112+
是否
113+
留言
Collapse file

‎text.txt‎

Copy file name to clipboardExpand all lines: text.txt
+27,642Lines changed: 27642 additions & 0 deletions
Large diffs are not rendered by default.
Collapse file

‎utils.py‎

Copy file name to clipboard
+41Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import json
2+
import re
3+
import time
4+
from http.cookies import SimpleCookie
5+
from imageio import imread
6+
from wordcloud import WordCloud
7+
import jieba.analyse
8+
9+
10+
def word_segment(text):
    """Print two top-20 summaries of *text*: raw token counts and jieba keywords.

    The first printed line is the 20 most common tokens produced by
    ``jieba.cut`` as ``(token, count)`` pairs. The second is the list of 20
    tags returned by ``jieba.analyse.extract_tags``.

    Args:
        text: The text to segment.
    """
    from collections import Counter

    # Register the stop-word list with jieba's keyword-extraction module.
    jieba.analyse.set_stop_words("./stopwords.txt")

    token_counts = Counter(jieba.cut(text))
    print(token_counts.most_common(20))

    print(jieba.analyse.extract_tags(text, topK=20))
21+
22+
if __name__ == '__main__':
    # Read the whole corpus up front; the context manager closes the file
    # handle as soon as the contents are in memory.
    with open("text.txt", encoding="utf-8") as corpus:
        text = corpus.read()
    word_segment(text)
26+
27+
28+
def word_cloud(texts):
    """Generate a word-cloud image from *texts* and save it to disk.

    The strings are joined with single spaces, laid out using
    ``./python-logo.png`` as the mask, and written to ``./wordcloud.jpg``.

    Args:
        texts: Iterable of strings supplying the words to draw.
    """
    data = " ".join(texts)
    # imageio's imread does not take scipy.misc.imread's ``flatten`` keyword,
    # so the mask image is loaded as-is.
    mask_img = imread('./python-logo.png')
    cloud = WordCloud(
        font_path='/Library/Fonts//华文黑体.ttf',
        background_color='white',
        mask=mask_img,
    ).generate(data)
    # Save through WordCloud itself: this module never imports a plotting
    # library, so the former ``plt`` calls could only raise NameError.
    cloud.to_file('./wordcloud.jpg')
Collapse file

‎youbute_comment/__init__.py‎

Copy file name to clipboard
+36-31Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,16 @@
11
"""
22
获取youtube视频下的评论
3-
43
思路:
4+
基于youtube官方的API来获取, 这里是关于如何初始化配置的文档 https://developers.google.com/youtube/v3/getting-started
55
6-
基于youtube官方的API来获取, 这里是关于如何初始化配置的文档https://developers.google.com/youtube/v3/getting-started
7-
8-
接口文档:https://developers.google.com/youtube/v3/docs/channelSections/list
9-
10-
视频地址:https://www.youtube.com/watch?v=FWMIPukvdsQ
6+
Comment API reference: https://developers.google.com/youtube/v3/docs/commentThreads/list
117
8+
任意视频地址:https://www.youtube.com/watch?v=FWMIPukvdsQ
129
"""
1310
import os

import requests
1411

1512
# 在 API Console 配置生成
16-
key = "xxxxxx"
13+
# Read the API key from the environment so the credential is never committed;
# falls back to the "xxxxxx" placeholder when YOUTUBE_API_KEY is unset.
key = os.environ.get("YOUTUBE_API_KEY", "xxxxxx")
1714
# 视频ID
1815
videoId = "FWMIPukvdsQ"
1916

@@ -22,39 +19,47 @@
2219
f"textFormat=plainText&" \
2320
f"part=snippet&" \
2421
f"videoId={videoId}&" \
25-
f"maxResults=100"
22+
f"maxResults=100" # 分页参数
2623

2724
proxies = {
28-
'http': 'socks5://127.0.0.1:1080',
29-
'https': 'socks5://127.0.0.1:1080',
25+
'http': 'socks5://127.0.0.1:1080',
26+
'https': 'socks5://127.0.0.1:1080',
3027
}
3128

3229

33-
# 获取一下页的凭证
30+
def spider(next_page_token):
    """Fetch one page of comments, print each one, and return the next token.

    Args:
        next_page_token: Value sent as the ``pageToken`` query parameter.
            A falsy value sends no ``pageToken`` at all.

    Returns:
        The ``nextPageToken`` field of the JSON response, or ``None`` when
        the response does not contain one.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    params = {"pageToken": next_page_token} if next_page_token else None
    # Without a timeout, requests waits indefinitely on a stalled connection.
    res = requests.get(url, proxies=proxies, params=params, timeout=30)
    # Surface HTTP errors here instead of failing later on a body that has
    # no "items".
    res.raise_for_status()
    data = res.json()

    # Treat a missing "items" field as an empty page rather than iterating None.
    for item in data.get("items") or []:
        comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
        print(comment)
    return data.get("nextPageToken")
3546

36-
def main():
    """Page through the comment listing from a fixed token, printing comments.

    For every page: request it, print the token of the following page, print
    the text of each top-level comment, sleep one second, then print a
    separator line. The loop ends when a response has no ``nextPageToken``.
    """
    import time

    page_token = "QURTSl9pMkR0YkxlcE1iOHhlLU1lNi1XWGhzTHdpUmlCN2w1UmJDNlVBaEhnT1dyejVFb3dnVGdWbExRSFNtMVRrNjE1TWVPWC04UVN2VGJrMkhjZ01KVmJpNllrRlVkdUFRWk1yVHp2cW1ZbjVNcXpFc2ZzRlI3ZkRlM3ZPUm1CalZSX1NkaE9qcEY4Tl8yUWRyMmN3"
    while page_token is not None:
        # An empty (but non-None) token requests the page without pageToken.
        query = {"pageToken": page_token} if page_token else None
        payload = requests.get(url, proxies=proxies, params=query).json()

        page_token = payload.get("nextPageToken")
        print(page_token)

        for entry in payload.get("items"):
            top_level = entry.get("snippet").get("topLevelComment")
            print(top_level.get("snippet").get("textDisplay"))

        time.sleep(1)
        print("==================")
47+
def run(max_retries=5, retry_delay=1):
    """Print every comment page by page, retrying a failed page a bounded number of times.

    The first page is fetched with ``spider(None)``. Each later page is
    fetched with the token returned by the previous call, and the token is
    printed before each request. When a request fails, the token and the
    traceback are printed and the same page is tried again after a pause.

    Args:
        max_retries: Consecutive failures tolerated for one page before the
            last exception is re-raised.
        retry_delay: Seconds to sleep after every request, successful or not.

    Raises:
        Exception: Whatever ``spider`` raised, once a page has failed more
            than ``max_retries`` times in a row.
    """
    import time
    import traceback

    next_page_token = spider(None)
    failures = 0
    while next_page_token:
        print(next_page_token)
        try:
            next_page_token = spider(next_page_token)
        except Exception:
            # The token is unchanged, so the next iteration retries this page.
            failures += 1
            print(next_page_token)
            print(traceback.format_exc())
            if failures > max_retries:
                # Give up rather than retrying a persistent failure forever.
                raise
        else:
            failures = 0
        # Pause on both paths so retries do not run back-to-back.
        time.sleep(retry_delay)
5661

5762

5863

5964
if __name__ == '__main__':
60-
main()
65+
run()

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.