前言:
目前咱们对“正则匹配img标签”大体比较关注,姐妹们都需要分析一些“正则匹配img标签”的相关内容。那么小编在网摘上网罗了一些关于“正则匹配img标签”的相关资讯,希望咱们能喜欢,朋友们一起来学习一下吧!

#!/usr/bin/env python
# coding=utf-8
# author:Charles
# datetime:2018/03/23/0004 15:26
# software: meizitu
import os, requests, time, re
class JanDan(object):
    """Scraper for the jandan.net 'ooxx' image board.

    Fetches a range of listing pages, extracts per-image links with a
    regex, and downloads every image into the current working directory.
    """

    def __init__(self):
        # Base URL of the board.
        # NOTE(review): this was an empty string in the original source
        # (probably scrubbed by the site that republished the code);
        # reconstructed from the referer header below -- confirm before use.
        self.url = 'http://jandan.net/ooxx'
        # Headers for the listing-page requests.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        # Per-image headers: the image host checks the Referer header as a
        # simple anti-hotlinking measure, so it must be sent with each image.
        self.headersa = {
            'Connection': 'keep-alive',
            'Host': 'ws3.sinaimg.cn',
            'Upgrade-Insecure-Requests': '1',
            # BUG FIX: the original value was 'http: //jandan.net/ooxx/'
            # (stray space after the scheme), which is not a valid URL.
            'referer': 'http://jandan.net/ooxx/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }

    def geta(self, url, params=None, header=None):
        """Wrapped GET request.

        Returns a dict with 'success' (bool) and, when success is True,
        'content' (the raw response body as bytes).  Any network error is
        reported and swallowed so one failed download does not abort the
        whole run.
        """
        ret = {'success': False}
        session = requests.session()
        try:
            if params:
                session.params = params
            if header:
                session.headers = header
            msg = session.get(url)
            # A requests.Response is falsy for 4xx/5xx status codes.
            if msg:
                ret['success'] = True
                ret['content'] = msg.content
        except Exception as e:
            # Best-effort: log and fall through with success=False.
            print(e)
        finally:
            session.close()
        return ret

    def get_url_all(self, starts, stop, wenjian):
        """Walk listing pages *starts*..*stop* (inclusive) and download
        every image found on each page.

        *wenjian* (target folder name) is unused here -- the caller has
        already chdir'ed into that folder; the parameter is kept for
        interface compatibility.  Returns False when a listing page
        cannot be fetched.
        """
        start = time.time()
        end_time = 0.0  # stays 0.0 when the page range is empty
        for p in range(starts, stop + 1):
            urls = self.url + '/' + 'page-%s#comments' % p
            url_all_resp = self.geta(url=urls, header=self.headers)
            if not url_all_resp['success']:
                return False
            # Response content is bytes; decode before regex matching.
            url_all_html = url_all_resp['content'].decode('utf-8', 'ignore')
            # Flatten the markup so the regex can match across lines.
            url_all_htmls = url_all_html.replace('\r', '').replace('\t', '').replace('\n', '')
            url_all_list = re.findall(
                'class="righttext">(.*?)<p><a href="(.*?)" target="_blank" class="view_img_link">(.*?)</p>',
                url_all_htmls)
            end = time.time()
            # Time spent collecting this page's URLs.
            self.t1 = end - start
            print('获取第----%d----页所有的url完成,耗时%f秒' % (p, self.t1))
            # Download every image referenced on this page.
            self.get_img_url(url_all_list)
            end_time = self.t1 + self.t2
            print('第----%d----页写入完成' % p)
        print('全站图片---第%d页至第%d页---煎蛋图写入完成,总耗时%f秒.......' % (starts, stop, end_time))

    def get_img_url(self, PageUrlList):
        """Download each image referenced in *PageUrlList* into the
        current working directory.

        Each element is a regex-match tuple from get_url_all; element [1]
        is the protocol-relative image URL (e.g. '//ws3.sinaimg.cn/...').
        Records the elapsed download time in self.t2.
        """
        start = time.time()
        num = 1
        for page_img_url in PageUrlList:
            urls = 'http:%s' % page_img_url[1]
            page_img_html = self.geta(url=urls, header=self.headersa)
            if page_img_html['success']:
                tupian = page_img_html['content']
                # BUG FIX: the original named the file with the URL's last
                # 36 characters, which can contain '/' and produce an
                # invalid path; use the URL's basename instead.
                filename = os.path.basename(page_img_url[1])
                with open(filename, 'wb') as f:
                    f.write(tupian)
                print('第%d张煎蛋图写入完成...' % num)
                num += 1
        end = time.time()
        # Read by get_url_all for its per-page summary line.
        self.t2 = end - start
if __name__ == '__main__':
    # Ask where to store the images, create that folder and work inside it.
    wenjianjai = input('请输入存放的文件夹名字:')
    path = 'D:/%s/' % wenjianjai
    # exist_ok: reusing an existing folder is fine -- the original
    # os.makedirs call crashed if the directory already existed.
    os.makedirs(path, exist_ok=True)
    os.chdir(path)
    num1 = int(input('请输入起始页码:'))
    num2 = int(input('请输入结束页码:'))
    JanDan().get_url_all(num1, num2, wenjianjai)
标签: #正则匹配img标签