龙空技术网

Requests爬取煎蛋网妹子图片(老司机第二弹)

阿毛杂记 240

前言:

目前咱们对“正则匹配img标签”大体比较关注,姐妹们都需要分析一些“正则匹配img标签”的相关内容。那么小编在网摘上网罗了一些关于“正则匹配img标签”的相关资讯,希望咱们能喜欢,朋友们一起来学习一下吧!

#!/usr/bin/env python

# coding=utf-8

# author:Charles

# datetime:2018/03/23/0004 15:26

# software: meizitu

import os, requests, time, re

class JanDan(object):
    """Scraper for the jandan.net "ooxx" picture board.

    Fetches listing pages, regex-extracts image URLs from the HTML, and
    writes each image to the current working directory (the caller is
    expected to chdir into the target folder first).
    """

    def __init__(self):
        # Base URL of the picture board.  The original code had '' here,
        # which made every generated page URL ('/page-N#comments') invalid.
        # Value inferred from the referer header below -- confirm before use.
        self.url = 'http://jandan.net/ooxx'
        # Headers for the listing pages: a plain browser User-Agent.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        # Headers for the image CDN.  The Referer header is required or the
        # CDN refuses to serve the file (simple anti-hotlinking check).
        self.headersa = {
            'Connection': 'keep-alive',
            'Host': 'ws3.sinaimg.cn',
            'Upgrade-Insecure-Requests': '1',
            # Bug fix: original value was 'http: //jandan.net/ooxx/' -- the
            # stray space made the referer malformed and useless.
            'referer': 'http://jandan.net/ooxx/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        # Timing accumulators.  Initialised here so that an early failure in
        # get_url_all (which reads self.t1 + self.t2) cannot raise
        # AttributeError when get_img_url never ran.
        self.t1 = 0.0
        self.t2 = 0.0

    def geta(self, url, params=None, header=None):
        """GET *url* and return a result dict.

        :param url: absolute URL to fetch.
        :param params: optional query parameters for the session.
        :param header: optional headers dict replacing the session defaults.
        :returns: ``{'success': bool, 'content': bytes}`` -- ``content`` is
            present only when ``success`` is True.
        """
        session = requests.session()
        ret = {'success': False}
        try:
            if params:
                session.params = params
            if header:
                session.headers = header
            # timeout added: without it a stalled connection hangs forever.
            msg = session.get(url, timeout=15)
            # Response.ok mirrors the original truthiness test (status < 400).
            if msg.ok:
                ret['success'] = True
                ret['content'] = msg.content
        except Exception as e:
            # Best-effort scraper: report and return success=False rather
            # than propagate (preserves the original swallow-and-continue).
            print(e)
        finally:
            session.close()
        return ret

    def get_url_all(self, starts, stop, wenjian):
        """Scrape listing pages *starts*..*stop* inclusive and download
        every image they reference.

        :param starts: first page number.
        :param stop: last page number (inclusive).
        :param wenjian: target folder name; unused here (caller chdirs
            first) and kept only for interface compatibility.
        :returns: False on the first failed page fetch, otherwise None.
        """
        start = time.time()
        for p in range(starts, stop + 1):
            urls = '%s/page-%s#comments' % (self.url, p)
            url_all_resp = self.geta(url=urls, header=self.headers)
            if not url_all_resp['success']:
                return False
            # content is bytes; decode before regex matching with a str
            # pattern (required on Python 3).
            url_all_html = url_all_resp['content'].decode('utf-8', 'ignore')
            # Flatten the HTML so '.*?' patterns can match across what were
            # originally separate lines.
            url_all_htmls = url_all_html.replace('\r', '').replace('\t', '').replace('\n', '')
            # Each match is (comment-meta, protocol-relative img URL, rest).
            url_all_list = re.findall(
                'class="righttext">(.*?)<p><a href="(.*?)" target="_blank" class="view_img_link">(.*?)</p>',
                url_all_htmls)
            self.t1 = time.time() - start
            print('获取第----%d----页所有的url完成,耗时%f秒' % (p, self.t1))
            # Download every image found on this page (sets self.t2).
            self.get_img_url(url_all_list)
            end_time = self.t1 + self.t2
            print('第----%d----页写入完成' % p)
        print('全站图片---第%d页至第%d页---煎蛋图写入完成,总耗时%f秒.......' % (starts, stop, end_time))

    def get_img_url(self, PageUrlList):
        """Download each image in *PageUrlList* into the current directory.

        :param PageUrlList: list of regex-match tuples from get_url_all;
            element [1] is a protocol-relative image URL ('//host/path').
        """
        start = time.time()
        num = 1
        for page_img_url in PageUrlList:
            urls = 'http:%s' % page_img_url[1]
            page_img_html = self.geta(url=urls, header=self.headersa)
            if page_img_html['success']:
                # Filename is the last 36 chars of the URL -- fragile, but
                # preserved from the original; TODO use os.path.basename.
                with open(page_img_url[1][-36:], 'wb') as f:
                    f.write(page_img_html['content'])
                print('第%d张煎蛋图写入完成...' % num)
                num += 1
        # Record elapsed time for the caller's total-time report.
        self.t2 = time.time() - start

if __name__ == '__main__':
    # Ask for the destination folder, create it under D:/ and chdir into it
    # so JanDan.get_img_url writes images there.
    wenjianjai = input('请输入存放的文件夹名字:')
    path = 'D:/%s/' % wenjianjai
    # exist_ok=True: re-running with the same folder name should not crash.
    os.makedirs(path, exist_ok=True)
    os.chdir(path)
    num1 = int(input('请输入起始页码:'))
    num2 = int(input('请输入结束页码:'))
    JanDan().get_url_all(num1, num2, wenjianjai)

标签: #正则匹配img标签