# 正则表达式获取贴吧访问量 (Using regular expressions to fetch Baidu Tieba visit counts)
import re
import urllib.request

TIEBA_URL = "https://tieba.baidu.com/f?kw=cpda&fr=ala0&tpl=5"

# NOTE(review): these patterns contain no HTML markup, so the tags that
# originally surrounded each capture group appear to have been lost.  A bare
# "(.*?)" only ever yields empty strings.  They are kept verbatim because the
# intended tags cannot be recovered from this file; restore them before
# relying on the output.
TIEBA_PATTERNS = ("(.*?) ", "(.*?)", "(.*?)", "(.*?)")


def fetch_tieba_html(url=TIEBA_URL):
    """Download *url* and return its body decoded as UTF-8.

    Bytes that cannot be decoded are dropped (``errors="ignore"``).  The
    response is used as a context manager so the connection is closed as
    soon as the body has been read.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", "ignore")


def find_tieba_matches(html, patterns=TIEBA_PATTERNS):
    """Return one ``re.findall`` result list per pattern, in pattern order.

    Args:
        html: The page text to search.
        patterns: Iterable of regular-expression strings.

    Returns:
        A list with the same length as *patterns*; element ``i`` holds every
        match of ``patterns[i]`` in *html*.
    """
    return [re.findall(pattern, html) for pattern in patterns]


def print_tieba_matches():
    """Fetch the Tieba page and print the match list for each pattern."""
    page = fetch_tieba_html()
    for matches in find_tieba_matches(page):
        print(matches)


if __name__ == "__main__":
    print_tieba_matches()
# 正则表达式学习2--豆瓣获取文章 (Regular expression practice 2: fetching articles from Douban)
import re
import urllib.request

DOUBAN_PROVIDERS_URL = "https://read.douban.com/provider/all"

# The pattern originally written here was a bare '(.*?)', which matches the
# empty string at every position.  The same script appears again later in
# this file with its HTML tags intact, and that version is used here.
PROVIDER_NAME_PATTERN = '<div class="name">(.*?)</div>'


def fetch_douban_html(url=DOUBAN_PROVIDERS_URL):
    """Download *url* and return its body decoded as UTF-8.

    Bytes that cannot be decoded are dropped (``errors="ignore"``).  The
    response is closed as soon as the body has been read.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", "ignore")


def extract_provider_names(html):
    """Return the text of every ``<div class="name">...</div>`` in *html*.

    The capture is non-greedy and ``.`` does not match newlines, so only
    names that sit on a single line are returned.
    """
    return re.findall(PROVIDER_NAME_PATTERN, html)


def print_provider_names():
    """Fetch the Douban provider page and print the extracted names.

    The whole list is printed first, then each name followed by a blank
    line (the explicit "\\n" plus the newline ``print`` adds).
    """
    names = extract_provider_names(fetch_douban_html())
    print(names)
    for name in names:
        print(name + "\n")


if __name__ == "__main__":
    print_provider_names()
# url数据获取--异常处理--新浪新闻获取文章 (URL data fetching with exception handling: fetching articles from Sina News)
import re
import urllib.error
import urllib.request

SINA_NEWS_URL = "http://news.sina.com.cn/"
SINA_LINK_PATTERN = 'href="(http://news.sina.com.cn/.*?)"'
SINA_SAVE_DIR = "D:/sinanews/"


def fetch_sina_html(url=SINA_NEWS_URL):
    """Download *url* and return its body decoded as UTF-8.

    Bytes that cannot be decoded are dropped (``errors="ignore"``).  The
    response is closed as soon as the body has been read.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", "ignore")


def extract_sina_links(html):
    """Return every ``href`` value in *html* that starts with the Sina news URL."""
    return re.findall(SINA_LINK_PATTERN, html)


def sina_save_path(index):
    """Return the local ``.html`` path used for the link at position *index*."""
    return SINA_SAVE_DIR + str(index) + ".html"


def crawl_sina_news():
    """Fetch the Sina news front page and report each article link found.

    For every link this prints a progress line, the link, the local path it
    maps to, and a success marker.
    """
    links = extract_sina_links(fetch_sina_html())
    for index, link in enumerate(links):
        try:
            print("第" + str(index) + "次爬取")
            print(link)
            print(sina_save_path(index))
            # NOTE(review): no download call is made in this loop, so the
            # success marker is printed unconditionally and URLError cannot
            # be raised here.  The handler is kept for when a fetch step is
            # added at this point.
            print("-------成功-------")
        except urllib.error.URLError as err:
            # HTTPError carries a status code; plain URLError only a reason.
            if hasattr(err, "code"):
                print(err.code)
            if hasattr(err, "reason"):
                print(err.reason)


if __name__ == "__main__":
    crawl_sina_news()
import re
import urllib.request

DOUBAN_PROVIDERS_URL = "https://read.douban.com/provider/all"
PROVIDER_NAME_PATTERN = '<div class="name">(.*?)</div>'


def fetch_douban_html(url=DOUBAN_PROVIDERS_URL):
    """Download *url* and return its body decoded as UTF-8.

    Bytes that cannot be decoded are dropped (``errors="ignore"``).  The
    response is used as a context manager so it is closed once read.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", "ignore")


def extract_provider_names(html):
    """Return the text of every ``<div class="name">...</div>`` in *html*.

    The capture is non-greedy and ``.`` does not match newlines, so only
    names that sit on a single line are returned.
    """
    return re.findall(PROVIDER_NAME_PATTERN, html)


def print_provider_names():
    """Fetch the Douban provider page and print the extracted names.

    The whole list is printed first, then each name followed by a blank
    line (the explicit "\\n" plus the newline ``print`` adds).
    """
    names = extract_provider_names(fetch_douban_html())
    print(names)
    for name in names:
        print(name + "\n")


if __name__ == "__main__":
    print_provider_names()