Proxies

A free proxy IP pool for crawlers:

https://github.com/jhao104/proxy_pool

This proxy pool works very well.

Just adjust its configuration, start it up, and then fetch content with the code below.

import requests

def get_proxy():
    # Ask the local proxy_pool API for one proxy ("ip:port")
    return requests.get("http://127.0.0.1:5010/get/").json()

def delete_proxy(proxy):
    # Remove a dead proxy from the pool
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

def get_html(url: str, params: dict, timeout: int):
    while True:
        proxy = get_proxy().get("proxy")
        print(proxy)
        try:
            html = requests.get(url, params=params, proxies={"https": proxy}, timeout=timeout)
            print('success')
            return html.text
        except KeyboardInterrupt:
            break
        except Exception:
            # This proxy failed; drop it from the pool and retry with the next one
            delete_proxy(proxy)
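
For example (the target URL and query parameters below are hypothetical, just to show the call):

if __name__ == '__main__':
    # Hypothetical target and parameters, purely for illustration
    text = get_html('https://httpbin.org/get', params={'q': 'demo'}, timeout=5)
    print(text)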

Another option is to scrape free proxy IPs yourself (here from www.89ip.cn), as in this repo:

https://github.com/cassieeric/python_crawler

https://github.com/cassieeric/python_crawler/tree/master/IP_proxy

import requests
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}

def get_ip(n):
    # Scrape page n of www.89ip.cn and return the first proxy that passes a quick check
    url = 'http://www.89ip.cn/index_{0}.html'.format(n)
    response = requests.get(url, headers=header)
    soup = BeautifulSoup(response.text, 'html.parser')
    items = soup.find_all('td')
    # The table lists the IP and the port in the first two of every five <td> cells
    for i, j in zip(items[::5], items[1::5]):
        ip = i.text.replace('\t', '').replace('\n', '') + ':' + j.text.replace('\n', '').replace('\t', '')
        proxies = {'https': ip}
        try:
            # Verify the proxy actually works before returning it
            response = requests.get(
                'http://httpbin.org/get',
                headers=header,
                proxies=proxies,
                timeout=3
            )
            if response.status_code == 200:
                print(ip)
                return ip
        except requests.RequestException:
            pass


def get_text(url: str, params: dict) -> str:
    # Start from page 10 of the proxy list and move forward whenever a proxy dies
    n = 10
    ip = get_ip(n)
    proxies = {'https': ip}
    while True:
        try:
            res = requests.get(
                url, params=params, headers=header, proxies=proxies, timeout=5
            )
            return res.text
        except KeyboardInterrupt:
            break
        except requests.RequestException:
            # The previous IP was banned or timed out; fetch a new one from the next page
            n += 1
            ip = get_ip(n)
            proxies = {'https': ip}

Usage:

try:
    res = requests.get(url, params=params, headers=header)
    res = res.text
except Exception:
    res = get_text(url, params)

Or wrap both steps inside get_text:

def get_text(url: str, params: dict) -> str:
    # CUSTOM_HEADER is your User-Agent dict (the same `header` used above)
    try:
        # First try a direct request without any proxy
        res = requests.get(url, params=params, headers=CUSTOM_HEADER)
        return res.text
    except Exception:
        # Direct access failed; fall back to free proxies
        n = 10
        ip = get_ip(n)
        proxies = {'https': ip}
        while True:
            try:
                res = requests.get(
                    url, params=params, headers=CUSTOM_HEADER, proxies=proxies, timeout=5
                )
                return res.text
            except KeyboardInterrupt:
                break
            except requests.RequestException:
                # The previous IP was banned or timed out; fetch a new one
                n += 1
                ip = get_ip(n)
                proxies = {'https': ip}

You can also add a timeout parameter to get_text so it can be tuned for different kinds of content.
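
A minimal sketch of that variant (the default of 5 seconds is just an assumption; pick whatever fits your site):

def get_text(url: str, params: dict, timeout: int = 5) -> str:
    # Same fallback logic as above, but the caller controls the per-request timeout
    try:
        res = requests.get(url, params=params, headers=CUSTOM_HEADER, timeout=timeout)
        return res.text
    except Exception:
        n = 10
        ip = get_ip(n)
        proxies = {'https': ip}
        while True:
            try:
                res = requests.get(
                    url, params=params, headers=CUSTOM_HEADER,
                    proxies=proxies, timeout=timeout
                )
                return res.text
            except KeyboardInterrupt:
                break
            except requests.RequestException:
                n += 1
                ip = get_ip(n)
                proxies = {'https': ip}

Slow, heavy pages can then use something like get_text(url, params, timeout=15), while lightweight requests keep a short timeout.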