前言:
今天大家对“爬虫代码python”可能比较看重,大家都想要了解一些“爬虫代码python”的相关知识。那么小编在网摘上网罗了一些有关“爬虫代码python”的相关内容,希望各位老铁们能喜欢,各位老铁们快快来学习一下吧!
作者:DataCastle
来源:知乎
第一节:下载百度首页信息
# Section 1: download the Baidu home page and print its HTML source.
import requests

# NOTE(review): the URL literal was empty in the scraped article, and
# requests.get('') fails with a missing-schema error. The address below is
# reconstructed from the section title -- confirm before relying on it.
response = requests.get('https://www.baidu.com', timeout=10)
response.encoding = 'utf-8'  # force UTF-8 decoding of the response body
print(response.text)
第二节:Requests+XPath 爬取豆瓣电影
1.爬取单个元素信息
# Section 2.1: fetch one Douban movie page and extract a single element
# (the film title) with XPath.
import requests
from lxml import etree

# NOTE(review): the URL literal was empty in the scraped article. This is a
# reconstructed example of a Douban movie detail page -- confirm or replace
# with the page you actually want to scrape.
url = 'https://movie.douban.com/subject/1292052/'
html = requests.get(url, timeout=10).text
tree = etree.HTML(html)

# xpath() returns a list of the matching text nodes (empty if nothing matches).
film = tree.xpath('//*[@id="content"]/h1/span[1]/text()')
print(film)
2.爬取多个元素信息
# Section 2.2: fetch one Douban movie page and extract several elements
# (title, director, cast, running time) with XPath.
import requests
from lxml import etree

# NOTE(review): the URL literal was empty in the scraped article. This is a
# reconstructed example of a Douban movie detail page -- confirm or replace
# with the page you actually want to scrape.
url = 'https://movie.douban.com/subject/1292052/'
html = requests.get(url, timeout=10).text
tree = etree.HTML(html)

# Each xpath() call returns a list of matching text nodes.
film = tree.xpath('//*[@id="content"]/h1/span[1]/text()')
director = tree.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
actor = tree.xpath('//*[@id="info"]/span[3]/span[2]/a/text()')
# Named `runtime` rather than `time` so it cannot be confused with (or
# shadow) the standard-library `time` module.
runtime = tree.xpath('//*[@id="info"]/span[13]/text()')

print('电影名称:', film)
print('导演:', director)
print('主演:', actor)
print('片长:', runtime)
第四节:爬取豆瓣图书TOP250信息
# Section 4: walk the 10 pages of the Douban book TOP250 list and print one
# comma-separated line per book.
import time

import requests
from lxml import etree

# NOTE(review): only '{}' survived of the URL in the scraped article. The
# template below is reconstructed from the section title and the a*25
# pagination -- confirm.
PAGE_URL = 'https://book.douban.com/top250?start={}'

for page in range(10):
    # The start offset advances by 25 per page: 0, 25, ..., 225.
    url = PAGE_URL.format(page * 25)
    html = requests.get(url, timeout=10).text
    tree = etree.HTML(html)
    tables = tree.xpath('//*[@id="content"]/div/div[1]/div/table')
    time.sleep(3)  # pause between page requests

    for table in tables:
        title = table.xpath("./tr/td[2]/div[1]/a/@title")[0]
        href = table.xpath("./tr/td[2]/div[1]/a/@href")[0]
        score = table.xpath("./tr/td[2]/div[2]/span[2]/text()")[0]
        # Peel off the surrounding parentheses and the whitespace inside them.
        num = (
            table.xpath("./tr/td[2]/div[2]/span[3]/text()")[0]
            .strip("(").strip().strip(")").strip()
        )
        # The one-line description is optional, so keep the (possibly empty) list.
        quote = table.xpath("./tr/td[2]/p[2]/span/text()")

        if quote:
            print("{},{},{},{},{}\n".format(title, href, score, num, quote[0]))
        else:
            print("{},{},{},{}\n".format(title, href, score, num))
第五节:爬取小猪短租房屋信息
# Section 5: walk result pages 1-5 of a Xiaozhu short-term rental search and
# print title, price, description and picture URL for every listing.
import time

import requests
from lxml import etree

# NOTE(review): the URL literal was empty in the scraped article, so
# ''.format(a) silently discarded the page number. The template below is
# reconstructed (the city subdomain in particular is a guess) -- confirm.
PAGE_URL = 'http://cd.xiaozhu.com/search-duanzufang-p{}-0/'

for page in range(1, 6):
    url = PAGE_URL.format(page)
    html = requests.get(url, timeout=10).text
    tree = etree.HTML(html)
    listings = tree.xpath('//*[@id="page_list"]/ul/li')
    time.sleep(3)  # pause between page requests

    for item in listings:
        title = item.xpath("./div[2]/div/a/span/text()")[0]
        price = item.xpath("./div[2]/span[1]/i/text()")[0]
        describe = item.xpath("./div[2]/div/em/text()")[0].strip()
        # The image address is read from the `lazy_src` attribute, not `src`.
        pic = item.xpath("./a/img/@lazy_src")[0]
        print("{} {} {} {}\n".format(title, price, describe, pic))
第六节:将爬取的数据存到本地
1.存储小猪短租数据
# Section 6.1: scrape result pages 1-5 of a Xiaozhu short-term rental search
# and save title, price, description and picture URL to a local CSV file.
import csv
import time

import requests
from lxml import etree

# NOTE(review): only the '{}-0/' tail of the URL survived in the scraped
# article. The prefix below is reconstructed (the city subdomain in
# particular is a guess) -- confirm.
PAGE_URL = 'http://cd.xiaozhu.com/search-duanzufang-p{}-0/'

# newline='' is what the csv module expects of the file object it writes to.
with open('/Users/mac/Desktop/xiaozhu.csv', 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes fields that contain commas, quotes or newlines; the
    # original "{},{},{},{}".format(...) produced misaligned columns whenever
    # a title or description contained a comma. lineterminator='\n' keeps
    # the same line ending the original wrote.
    writer = csv.writer(f, lineterminator='\n')

    for page in range(1, 6):
        url = PAGE_URL.format(page)
        html = requests.get(url, timeout=10).text
        tree = etree.HTML(html)
        listings = tree.xpath('//*[@id="page_list"]/ul/li')
        time.sleep(3)  # pause between page requests

        for item in listings:
            title = item.xpath("./div[2]/div/a/span/text()")[0]
            price = item.xpath("./div[2]/span[1]/i/text()")[0]
            describe = item.xpath("./div[2]/div/em/text()")[0].strip()
            # The image address is read from the `lazy_src` attribute, not `src`.
            pic = item.xpath("./a/img/@lazy_src")[0]
            writer.writerow([title, price, describe, pic])
2.存储豆瓣图书TOP250数据
# Section 6.2: scrape the 10 pages of the Douban book TOP250 list and save
# title, link, score, rating count and (when present) the one-line
# description to a local CSV file.
import csv
import time

import requests
from lxml import etree

# NOTE(review): only '{}' survived of the URL in the scraped article. The
# template below is reconstructed from the section title and the a*25
# pagination -- confirm.
PAGE_URL = 'https://book.douban.com/top250?start={}'

# newline='' is what the csv module expects of the file object it writes to.
with open('/Users/mac/Desktop/top250.csv', 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes fields that contain commas, quotes or newlines; the
    # original str.format-based rows broke whenever a title or description
    # contained a comma. lineterminator='\n' keeps the same line ending the
    # original wrote.
    writer = csv.writer(f, lineterminator='\n')

    for page in range(10):
        # The start offset advances by 25 per page: 0, 25, ..., 225.
        url = PAGE_URL.format(page * 25)
        html = requests.get(url, timeout=10).text
        tree = etree.HTML(html)
        tables = tree.xpath('//*[@id="content"]/div/div[1]/div/table')
        time.sleep(3)  # pause between page requests

        for table in tables:
            title = table.xpath("./tr/td[2]/div[1]/a/@title")[0]
            href = table.xpath("./tr/td[2]/div[1]/a/@href")[0]
            score = table.xpath("./tr/td[2]/div[2]/span[2]/text()")[0]
            # Peel off the surrounding parentheses and the whitespace inside them.
            num = (
                table.xpath("./tr/td[2]/div[2]/span[3]/text()")[0]
                .strip("(").strip().strip(")").strip()
            )
            # The one-line description is optional; rows without it have
            # four columns instead of five, as in the original output.
            quote = table.xpath("./tr/td[2]/p[2]/span/text()")

            row = [title, href, score, num]
            if quote:
                row.append(quote[0])
            writer.writerow(row)
第七节:爬取豆瓣分类电影信息,解决动态加载页面
# Section 7: scrape Douban's category movie listing. The page is loaded
# dynamically, so the data is requested from a JSON endpoint instead of
# being parsed out of HTML.
import requests
import json
import time

# NOTE(review): only the ';range=0,10&tags=&start={}' tail of the URL
# survived in the scraped article. The prefix below is reconstructed --
# confirm.
API_URL = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start={}'

for page in range(3):
    url_visit = API_URL.format(page * 20)  # start offsets 0, 20, 40
    # Unlike the earlier sections, the response is JSON, so decode it directly.
    payload = requests.get(url_visit, timeout=10).json()
    time.sleep(2)  # pause between requests

    # Take at most 20 movies per page. Slicing instead of indexing with
    # range(20) avoids an IndexError when a page returns fewer than 20 items.
    for movie in payload['data'][:20]:
        urlname = movie['url']
        title = movie['title']
        rate = movie['rate']
        cast = movie['casts']
        print('{} {} {} {}\n'.format(title, rate, ' '.join(cast), urlname))
版权声明:
本站文章均来自互联网搜集,如有侵犯您的权益,请联系我们删除,谢谢。
标签: #爬虫代码python