UA Proxy Pools and IP Proxy Pools
1. UA Proxy Pool
A UA proxy pool, also called a user-agent pool, rotates the User-Agent field in the HTTP request headers so that each request appears to come from a different browser. This disguise is one of the important counter-measures against anti-crawling systems.
The idea: randomly pick a user-agent from a predefined list each time a page is fetched.
Add the following to settings.py. Mapping the built-in UserAgentMiddleware to None disables it, and 400 is the priority at which our replacement runs:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'qiubai_proj.middlewares.RotateUserAgentMiddleware': 400,
}
Also add a USER_AGENT_LIST setting in settings.py:
USER_AGENT_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
Then add a RotateUserAgentMiddleware class to middlewares.py:
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

from qiubai_proj.settings import USER_AGENT_LIST


class RotateUserAgentMiddleware(UserAgentMiddleware):
    '''
    User-agent rotation middleware (sits in the downloader middleware chain).
    '''
    def process_request(self, request, spider):
        # Pick a random UA for every outgoing request
        user_agent = random.choice(USER_AGENT_LIST)
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)
            print(f"User-Agent: {user_agent}")
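To sanity-check the rotation outside Scrapy, here is a minimal sketch (assuming httpbin.org is reachable; it simply echoes back the request headers it received):

import random

import requests

from qiubai_proj.settings import USER_AGENT_LIST

for _ in range(3):
    ua = random.choice(USER_AGENT_LIST)
    r = requests.get("http://httpbin.org/headers", headers={"User-Agent": ua})
    # httpbin echoes the headers it saw, so this should print the UA we picked
    print(r.json()["headers"]["User-Agent"])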
2. IP Proxy Pool

When crawling websites with Python you quickly run into anti-crawling mechanisms, and an important way around them is to use IP proxies. Plenty of proxies can be found online, but stable ones are expensive, so it is well worth building your own pool out of free proxies.
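At its core, validating a free proxy is a single requests call made through the candidate. A minimal sketch before the full threaded script below, using httpbin.org/ip as the test endpoint (an assumption; any stable page works):

import requests

def is_alive(ip_port, timeout=10):
    """Return True if the candidate proxy fetches a test page within the timeout."""
    proxies = {"http": f"http://{ip_port}"}
    try:
        return requests.get("http://httpbin.org/ip",
                            proxies=proxies, timeout=timeout).ok
    except requests.RequestException:
        return False

print(is_alive("187.65.49.137:3128"))  # e.g. a pair from the list further below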
The free-proxy list page itself must be fetched with a browser disguise; the script below, built on the requests library, downloads the list and validates each entry concurrently:
# -*- coding: utf-8 -*-
__author__ = 'zhougy'
__date__ = '2018/9/7 2:32 PM'

import time
import queue
import threading
from threading import Lock

import requests

g_lock = Lock()
n_thread = 10

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/68.0.3440.106 Safari/537.36",
}


def fetch_web_data(url, proxy=None, timeout=10):
    try:
        r = requests.get(url, timeout=timeout, headers=headers, proxies=proxy)
        return r.text
    except Exception:
        print(f"fetch_web_data has error, url: {url}")
        return None


def write_ip_pair(ip_pair):
    '''
    Append a validated ip:port pair to proxy_ip_list_<date>.txt.
    '''
    proxy_file_name = "proxy_ip_list_%s.txt" % (
        time.strftime("%Y.%m.%d", time.localtime(time.time())))
    with open(proxy_file_name, "a+", encoding="utf-8") as f:
        f.write(f"{ip_pair}\n")


class IpProxyCheckThread(threading.Thread):
    '''Worker: pull candidate ip:port pairs off the queue and test them.'''

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.__queue = queue

    def run(self):
        global g_lock
        while True:
            data = self.__queue.get()
            ip_port_pair = data.split(",")[0]
            print(f"the check ip is {ip_port_pair}")
            proxy = {
                "http": ip_port_pair,
            }
            url = ""  # test URL lost when the article was scraped; use any stable page
            data = fetch_web_data(url, proxy=proxy, timeout=15)
            if data is None:
                print(f"proxy {ip_port_pair} failed the check, discarded")
                self.__queue.task_done()
                continue
            print(f"proxy {ip_port_pair} passed the check, usable")
            g_lock.acquire()
            write_ip_pair(ip_port_pair)
            g_lock.release()
            self.__queue.task_done()


class FetchProxyListThread(threading.Thread):
    '''Fetch the raw proxy list and feed each line into the queue.'''

    def __init__(self, url, mq):
        threading.Thread.__init__(self)
        self.__url = url
        self.__mq = mq

    def run(self):
        data = fetch_web_data(self.__url)
        print(data)
        ip_pool_list = data.split("\n")
        [self.__mq.put(ip_pool) for ip_pool in ip_pool_list]


def process():
    mq = queue.Queue()
    thread_list = []
    for i in range(n_thread):
        t = IpProxyCheckThread(mq)
        t.daemon = True  # checker threads die with the main thread
        thread_list.append(t)
    [t.start() for t in thread_list]
    url = ""  # proxy-list URL lost when the article was scraped; use your provider's list
    fth = FetchProxyListThread(url, mq)
    fth.start()
    fth.join()  # wait until the whole list has been queued
    mq.join()   # wait until every queued candidate has been checked


if __name__ == "__main__":
    process()

Adding the IP Proxy Pool to Scrapy
The day05 section described how to obtain a set of IPs with good connectivity.
(1) Put the validated IP:port pairs filtered out above into a list (you can also read them from the file and load them into a list, as sketched after the example below).
Create a my_proxies.py file with contents along these lines:
PROXY =["187.65.49.137:3128","108.61.246.98:8088","167.99.197.73:8080",]
(2) Add an IP proxy middleware to middlewares.py:
import random

from . import my_proxies


class MyProxyMiddleware(object):
    def process_request(self, request, spider):
        # Scrapy expects a full proxy URL, scheme included
        request.meta['proxy'] = 'http://' + random.choice(my_proxies.PROXY)
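Free proxies go stale quickly, so one possible extension (not part of the original recipe) is to evict a proxy from the pool when it causes a download error and reschedule the request. Scrapy downloader middlewares support this through process_exception; a hedged sketch of a method you could add to MyProxyMiddleware:

def process_exception(self, request, exception, spider):
    # Hypothetical addition: drop the failing proxy, then retry the request
    bad = request.meta.get('proxy', '').replace('http://', '')
    if bad in my_proxies.PROXY and len(my_proxies.PROXY) > 1:
        my_proxies.PROXY.remove(bad)  # evict the dead proxy
    return request  # reschedule; process_request will pick a fresh proxy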
(3) Register the middleware in the settings file:
DOWNLOADER_MIDDLEWARES = {
    'qiubai_proj.middlewares.MyProxyMiddleware': 300,
}
(4) Start the crawl and observe the proxy pool in action:
scrapy crawl qiubai
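To confirm that requests really do go out through rotating proxies, you could log the proxy attached to each response from inside the spider (a fragment sketch; parse is the spider's callback):

def parse(self, response):
    # request.meta carries the proxy chosen by MyProxyMiddleware
    self.logger.info("fetched %s via %s",
                     response.url, response.request.meta.get('proxy'))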