Python 爬虫 ip 被封,公司给了个付费的快代理接口。我先验证付费接口返回的 ip 是否可用,然后再拿它去爬目标网站,但还是出现 requests.exceptions.ProxyError

This topic was created 2940 days ago; the information mentioned may have changed or developed since then.

import random
import time
from random import choice

import requests
from lxml import etree

def get_proxy(max_attempts=5, check_timeout=5):
    """Return a validated ``proxies`` mapping for ``requests``.

    Asks the Kuaidaili API for one proxy address, then sends a probe request
    to the target site through it. The first proxy that answers the probe
    with HTTP 200 is returned as ``{'http': 'http://<address>'}``.

    Passing validation only proves the proxy worked at that moment; it can
    still expire before the caller uses it, so callers should be prepared to
    catch ``requests.exceptions.ProxyError`` and ask for another proxy.

    Args:
        max_attempts: How many proxies to try before giving up.
        check_timeout: Timeout in seconds for the probe request, so a dead
            proxy cannot block the crawler indefinitely.

    Returns:
        A dict suitable for the ``proxies=`` argument of ``requests.get``.

    Raises:
        requests.exceptions.ProxyError: If no working proxy was obtained
            within ``max_attempts`` tries.
    """
    api_url = 'http://svip.kuaidaili.com/api/getproxy/?orderid=&num=1&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=2&an_an=1&an_ha=1&quality=2&sep=1'
    check_url = 'http://nj.58.com/chuzu/?key=%E7%A7%9F%E6%88%BF'

    # A loop instead of recursion: the retry result is actually returned and
    # the number of attempts is bounded.
    for _ in range(max_attempts):
        try:
            # NOTE(review): assumes the API body is a bare "host:port" string;
            # surrounding whitespace/newlines are stripped before use.
            proxy_temp = requests.get(url=api_url, timeout=1).text.strip()
            if not proxy_temp:
                continue

            proxy = {'http': 'http://{}'.format(proxy_temp)}
            probe = requests.get(url=check_url, proxies=proxy,
                                 timeout=check_timeout)
            if probe.status_code == 200:
                return proxy
        except requests.exceptions.RequestException as e:
            # Covers ProxyError, timeouts and connection errors: this proxy
            # (or the API call) failed, so report it and try the next one.
            print(e)

    raise requests.exceptions.ProxyError(
        'no working proxy after {} attempts'.format(max_attempts))

def _fetch_text(url, user_agents, timeout=1, attempts=3):
    """Download *url* through a fresh proxy; return the body text or None.

    Each attempt asks ``get_proxy()`` for a new proxy and picks a random
    User-Agent header. A proxy can stop working between its validation and
    this request, so any ``RequestException`` (ProxyError, timeout, HTTP
    error status, malformed URL, ...) is printed and the download is retried
    with another proxy. ``None`` is returned when every attempt failed.
    """
    for _ in range(attempts):
        try:
            proxy = get_proxy()
            if proxy is None:
                # No usable proxy was handed back; ask again rather than
                # sending the request from our own address.
                continue
            reply = requests.get(url=url, proxies=proxy, timeout=timeout,
                                 headers=random.choice(user_agents))
            # Treat 4xx/5xx answers as failures instead of parsing them.
            reply.raise_for_status()
            return reply.text
        except requests.exceptions.RequestException as e:
            print(e)
    return None


def _parse_html(text):
    """Parse *text* with ``etree.HTML``; return None if it cannot be parsed."""
    if not text or not text.strip():
        return None
    try:
        return etree.HTML(text)
    except (etree.LxmlError, ValueError) as e:
        print(e)
        return None


def _parse_brand_apartment(html):
    """Extract the listing fields from a "pinpaigongyu" detail page.

    Raises IndexError when an expected node is missing from the page.
    """
    return {
        'phone': str(html.xpath('//div[@class="phonenum getPrivateCallBtnStyle"]/text()')),
        # NOTE(review): after ``.split()[0]`` the token contains no
        # whitespace, so ``.split(' [')`` cannot yield a second element and
        # this expression raises IndexError as written. The intended title
        # format is not visible here - confirm against a real page and fix.
        'rent_type': html.xpath('//div[@class="housedetail center cf"]/h2/text()')[0].split()[0].split('] ')[0].split(' [')[1],
        'area': html.xpath('//ul[@class="house-info-list"]/li[1]/span/text()')[0].split()[0] + "平",
        'room_type': html.xpath('//ul[@class="house-info-list"]/li[2]/span/text()')[0].split()[0],
        'address': html.xpath('//ul[@class="house-info-list"]/li[4]/span/text()')[0].strip(),
        'traffic': str(html.xpath('//ul[@class="house-info-list"]/li[5]/span/text()')),
        'pictures': html.xpath('//ul[@id="pic-list"]/li/img/@lazy_src'),
        'house_description': html.xpath('//p[@id="desc"]/text()')[0].replace(' ', ''),
    }


def _parse_listing(html):
    """Extract the listing fields from a regular (non-"pinpaigongyu") page.

    Raises IndexError when an expected node is missing from the page.
    """
    # Room type and area share one text node: first token / second token.
    rooms_and_area = html.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')[0].split()
    return {
        'phone': str(html.xpath('//div[@class="house-chat-phonenum"]/p[@class="phone-num"]/text()')),
        'rent_type': html.xpath('//ul[@class="f14"]/li[1]/span[2]/text()')[0].split('-')[0],
        'area': rooms_and_area[1] + "平",
        'room_type': rooms_and_area[0],
        'address': html.xpath('//ul[@class="f14"]/li[6]/span[2]/text()')[0].strip(),
        'traffic': str(html.xpath('//ul[@class="f14"]/li[5]/em/text()')),
        'pictures': html.xpath('//ul[@id="housePicList"]/li/img/@lazy_src'),
        'house_description': str(html.xpath('//ul[@class="introduce-item"]/li[2]/span[@class="a2"]//text()')).strip(),
    }


def crawl():
    """Walk the 58.com Nanjing rental list pages and print each phone field.

    The first list page is read to find the number of pages from the pager.
    Every list page is then fetched, each detail link on it is visited after
    a random pause of up to three seconds, the listing fields are extracted,
    and the phone field is printed.

    All downloads go through ``_fetch_text``, i.e. through a proxy with a
    random User-Agent and with retries. A page that cannot be downloaded or
    parsed is reported and skipped instead of aborting the whole crawl.
    """
    frist_url = 'http://nj.58.com/chuzu'

    headers = [
        {'User-Agent': 'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
        {'User-Agent': 'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'},
        {'User-Agent': 'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0;'},
        {'User-Agent': 'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1'},
        {'User-Agent': 'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)'},
        {'User-Agent': 'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)'},
        {'User-Agent': 'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11'},
    ]

    first_page = _parse_html(_fetch_text(frist_url, headers))
    if first_page is None:
        # Without the first page there is nothing to iterate over.
        print('could not load {}'.format(frist_url))
        return

    pager_labels = first_page.xpath('//div[@class="pager"]/a/span/text()')
    try:
        # The second-to-last pager label is read as the last page number.
        max_page = int(pager_labels[-2])
    except (IndexError, ValueError):
        # No usable pager on the page: crawl the first page only.
        max_page = 1

    for page in range(1, max_page + 1):
        next_url = frist_url + "/pn" + str(page)

        list_page = _parse_html(_fetch_text(next_url, headers))
        if list_page is None:
            print('skipping list page {}'.format(next_url))
            continue

        detail_urls = list_page.xpath('//ul[@class="listUl"]/li/div[@class="img_list"]/a/@href')

        for detail_url in detail_urls:
            # Random pause between detail requests.
            time.sleep(random.random() * 3)

            html = _parse_html(_fetch_text(detail_url, headers))
            if html is None:
                # Download failed: skip instead of re-parsing an older page.
                print('skipping detail page {}'.format(detail_url))
                continue

            try:
                if "pinpaigongyu" in detail_url:
                    record = _parse_brand_apartment(html)
                else:
                    record = _parse_listing(html)
            except IndexError:
                # An expected node is missing (layout change, block page...).
                print('unexpected page layout: {}'.format(detail_url))
                continue

            print(record['phone'])

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    crawl()

1 reply • 2018-06-19 20:24:34 +08:00

U87

Jun 19, 2018

还是出现 requests.exceptions.ProxyError,难道是在验证代理和爬取目标网站之间的这段时间里,ip 失效了?