前言:
此时小伙伴们对“centossafari”大致比较重视,兄弟们都需要知道一些“centossafari”的相关资讯。那么小编同时在网络上网罗了一些有关“centossafari”的相关资讯,希望小伙伴们能喜欢,各位老铁们一起来学习一下吧!头条有多火?很火,超级火!抖音有多火?比头条有过之而无不及。据说抖音国际版在美国市场也被热捧,可以用一句话概括:犹如蝗虫所到之处,庄稼寸草不生。
>> 无私分享Python爬虫干货,如果你也想学习Python,@ 私信小编获取
很多人喜欢玩抖音,我也喜欢看抖音小姐姐,可如果拿着手机一条条去刷确实很耗时间,如果 Python 能帮忙筛选出颜值高的小姐姐那就省了很多事。
现在如果掌握了Python,就可以自动下载这些高颜值小姐姐的抖音视频,而且还都是高清无水印版本的呢!
环境说明
python 3.7.1
centos 7.4
pip 10.0.1
部署
[root@localhost ~]# python3.7 --version
Python 3.7.1
[root@localhost ~]#
[root@localhost ~]# pip3 install douyin
如果安装失败,重新执行上面的命令即可,直到安装完成。
设置HTTP代理
在学习Python爬虫的时候,经常会遇见所要爬取的网站采取了反爬取技术,高强度、高效率地爬取网页信息常常会给网站服务器带来巨大压力,所以同一个IP反复爬取同一个网页,就很可能被封。这里讲述一个爬虫技巧,设置极光代理IP。
from bs4 import BeautifulSoupimport requestsimport randomdef get_ip_list(url, headers):web_data = requests.get(url, headers=headers)soup = BeautifulSoup(web_data.text, 'lxml')ips = soup.find_all('tr')ip_list = []for i in range(1, len(ips)):ip_info = ips[i]tds = ip_info.find_all('td')ip_list.append(tds[1].text + ':' + tds[2].text)return ip_listdef get_random_ip(ip_list):proxy_list = []for ip in ip_list:proxy_list.append('; + ip)proxy_ip = random.choice(proxy_list)proxies = {'http': proxy_ip}return proxiesif __name__ == '__main__':url = ';headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}ip_list = get_ip_list(url, headers=headers)proxies = get_random_ip(ip_list)print(proxies)
运行上面的代码会得到一个随机的proxies,把它直接传入requests的get方法中即可。
web_data = requests.get(url, headers=headers, proxies=proxies)
用代理IP不仅可以隐藏自身IP,还可以防止自身IP被封锁。极光HTTP代理拥有海量IP,不仅使用方便快捷,更安全可靠。
导入抖音模块
[root@localhost ~]# python3.7
>>>import douyin
>>>
Python爬虫代码实现
#!/usr/bin/env python 3# -*- coding:utf-8 -*-from bs4 import BeautifulSoupfrom contextlib import closingimport requests, json, time, re, os, sys, timeimport numpy as npimport csvclass ScrapyDouYin(object):def __init__(self):"""抖音App视频下载"""# SSL认证passdef get_video_urls(self, user_id):"""获得视频播放地址Parameters:nickname:查询的用户名Returns:video_names: 视频名字列表video_urls: 视频链接列表aweme_count: 喜欢的视频数量"""video_names = []video_urls = []unique_id = ''while unique_id != user_id:search_url = '*1920&dpi=480&update_version_code=1622' % user_idreq = requests.get(url=search_url, verify=False)html = json.loads(req.text)aweme_count = html['user_list'][0]['user_info']['favoriting_count']uid = html['user_list'][0]['user_info']['uid']nickname = html['user_list'][0]['user_info']['nickname']unique_id = html['user_list'][0]['user_info']['unique_id']user_url = '; % (uid, aweme_count)req = requests.get(url=user_url, verify=False)html = json.loads(req.text)i = 1user_id_lst = []for each in html['aweme_list']:share_desc = each['share_info']['share_desc']##视频作者的抖音号user_id_lst.append(each['author']['short_id'])if '抖音-原创音乐短视频社区' == share_desc:video_names.append(str(i) + '.mp4')i += 1else:video_names.append(share_desc + '.mp4')video_urls.append(each['share_info']['share_url'])user_id_lst = list(np.unique(user_id_lst))return video_names, video_urls, nickname, user_id_lstdef get_download_url(self, video_url):"""获得视频播放地址Parameters:video_url:视频播放地址Returns:download_url: 视频下载地址"""req = requests.get(url=video_url, verify=False)bf = BeautifulSoup(req.text, 'lxml')script = bf.find_all('script')[-1]video_url_js = re.findall('var data = \[(.+)\];', str(script))[0]video_html = json.loads(video_url_js)download_url = video_html['video']['play_addr']['url_list'][0]return download_urldef video_downloader(self, video_url, video_name):"""视频下载Parameters:NoneReturns:None"""size = 0with closing(requests.get(video_url, stream=True, verify=False)) as response:chunk_size = 1024content_size = 
int(response.headers['content-length'])if response.status_code == 200:sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))with open(video_name, "wb") as file:for data in response.iter_content(chunk_size=chunk_size):file.write(data)size += len(data)file.flush()sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100))sys.stdout.flush()time.sleep(1)def run(self):"""运行函数Parameters:NoneReturns:None"""self.hello()# user_id = input('请输入ID(例如173553803):')user_id = 'lxylky1688'video_names, video_urls, nickname, user_id_lst = self.get_video_urls(user_id)if nickname not in os.listdir():os.mkdir(nickname)sys.stdout.write('视频下载中:\n')for num in range(len(video_urls)):print(' %s\n' % video_urls[num])video_url = self.get_download_url(video_urls[num])if '\\' in video_names[num]:video_name = video_names[num].replace('\\', '')elif '/' in video_names[num]:video_name = video_names[num].replace('/', '')else:video_name = video_names[num]self.video_downloader(video_url, os.path.join(nickname, video_name))print('')csvFile = open('get_favorite_id.txt', 'w+')try:writer = csv.writer(csvFile)writer.writerow(user_id_lst)finally:csvFile.close()def hello(self):"""打印欢迎界面Parameters:NoneReturns:None"""print('*' * 100)print('\t\t\t\t抖音App视频下载小助手')print('*' * 100)if __name__ == '__main__':Scrapydouyin = ScrapyDouYin()Scrapydouyin.run()最后,成功获取女神们的全部视频...
为了帮助大家更轻松的学好Python,我给大家分享一套Python学习资料,@ 私信小编领取PS:如果觉得本篇文章对您有所帮助,欢迎关注、收藏、转发( ´・ω・)ノ(._.`)
标签: #centossafari