diff --git a/razzl/0008/0008.py b/razzl/0008/0008.py
new file mode 100644
index 00000000..542efcca
--- /dev/null
+++ b/razzl/0008/0008.py
@@ -0,0 +1,19 @@
+"""Print the text-dense lines of a web page (crude main-content extraction)."""
+from __future__ import division  # true division for the density ratio below
+import re
+import urllib2
+
+url = 'http://world.cankaoxiaoxi.com/2015/0404/730644.shtml'
+html = urllib2.urlopen(url).read()
+# Drop <script> and <style> elements together with everything inside them.
+html = re.sub(r'<script[^>]*>([\s\S])*?</script[^>]*>', '', html)
+html = re.sub(r'<style[^>]*>([\s\S])*?</style[^>]*>', '', html)
+for line in re.split("[\r\n]+", html):
+    if line.strip() == '':
+        continue
+    # Text that remains once every tag on the line has been removed.
+    line_sub = re.sub(r'<[^>]*>', '', line)
+    # Keep lines where at least half of the characters are text, not markup.
+    if len(line_sub) / len(line) >= 0.5:
+        if line_sub.strip() != '':
+            print line_sub.strip()
diff --git a/razzl/0009/0009.py b/razzl/0009/0009.py
new file mode 100644
index 00000000..3108f6b3
--- /dev/null
+++ b/razzl/0009/0009.py
@@ -0,0 +1,14 @@
+"""Print every src/href attribute value found in a web page."""
+import re
+import urllib2
+
+url = 'http://www.cnblogs.com/jasondan/p/3497757.html'
+html = urllib2.urlopen(url).read()
+# Each match is a (src, href) tuple; only one of the two groups is non-empty.
+# [^"]+ stops the capture at the closing quote instead of running on through
+# later attributes of the same tag.
+links = re.findall(r'<[^>]+src="([^"]+)"[^>]*>|<[^>]+href="([^"]+)"[^>]*>', html)
+for link in links:
+    for lin in link:
+        if lin:  # skip the empty group of the alternative that did not match
+            print lin
diff --git a/razzl/0013/0013.py b/razzl/0013/0013.py
new file mode 100644
index 00000000..efc58185
--- /dev/null
+++ b/razzl/0013/0013.py
@@ -0,0 +1,13 @@
+"""Download the images referenced by a Baidu Tieba thread page."""
+import urllib2
+import urllib
+import re
+
+url = 'http://tieba.baidu.com/p/2166231880'
+html = urllib2.urlopen(url).read()
+
+# Image URLs are taken from tags marked class="BDE_Image".
+photos = re.findall(r'class="BDE_Image" src="([^"]+)"', html)
+# Saved as 0.jpg, 1.jpg, ... in page order; the target folder is not created here.
+for i, photo in enumerate(photos):
+    urllib.urlretrieve(photo, 'C:/Users/zzl/Desktop/1/' + str(i) + '.jpg')