Proxies

Free crawler proxy IP pool:
https://github.com/jhao104/proxy_pool
This proxy pool is very easy to use: just edit the configuration, start the service, and then fetch content with the program below.
```python
import requests


def get_proxy():
    # Fetch one proxy from the locally running proxy_pool service
    return requests.get("http://127.0.0.1:5010/get/").json()


def delete_proxy(proxy):
    # Remove a dead proxy from the pool
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))


def get_html(url: str, params: dict, timeout: int):
    while True:
        proxy = get_proxy().get("proxy")
        print(proxy)
        try:
            html = requests.get(url, params=params,
                                proxies={"https": proxy}, timeout=timeout)
            print('success')
            return html.text
        except KeyboardInterrupt:
            break
        except Exception:
            # This proxy failed; drop it from the pool and try the next one
            delete_proxy(proxy)
```
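For example, fetching a page through the pool (the URL and parameters here are just placeholders):

```python
# Hypothetical usage: replace the URL and params with your actual target
text = get_html("https://example.com/search", params={"q": "python"}, timeout=5)
if text:
    print(text[:200])
```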
Another option is to scrape free proxy IPs yourself, as in this project:
https://github.com/cassieeric/python_crawler
https://github.com/cassieeric/python_crawler/tree/master/IP_proxy
```python
import requests
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}


def get_ip(n):
    # Scrape page n of www.89ip.cn and return the first proxy that passes a check
    url = 'http://www.89ip.cn/index_{0}.html'.format(n)
    response = requests.get(url, headers=header)
    soup = BeautifulSoup(response.text, 'html.parser')
    items = soup.find_all('td')
    # Each table row has five <td> cells; the first two hold the IP and the port
    for i, j in zip(items[::5], items[1::5]):
        ip = i.text.strip() + ':' + j.text.strip()
        proxies = {'https': ip}
        try:
            # Validate the proxy with a quick request to httpbin
            response = requests.get(
                'http://httpbin.org/get',
                headers=header,
                proxies=proxies,
                timeout=3
            )
            if response.status_code == 200:
                print(ip)
                return ip
        except requests.RequestException:
            pass


def get_text(url: str, params: dict) -> str:
    n = 10
    ip = get_ip(n)
    proxies = {'https': ip}
    while True:
        try:
            res = requests.get(
                url, params=params, headers=header, proxies=proxies, timeout=5
            )
            return res.text
        except KeyboardInterrupt:
            break
        except requests.RequestException:
            # The previous IP was banned; fetch a fresh one from the next page
            n += 1
            ip = get_ip(n)
            proxies = {'https': ip}
```
Usage:
```python
# Try a direct request first; fall back to the proxy version on failure
try:
    res = requests.get(url, params=params, headers=header)
    text = res.text
except Exception:
    text = get_text(url, params)
```
Or wrap both paths inside get_text:
```python
def get_text(url: str, params: dict) -> str:
    # CUSTOM_HEADER is your request-headers dict (e.g. the `header` defined above)
    try:
        # Try a direct request first
        res = requests.get(url, params=params, headers=CUSTOM_HEADER)
        return res.text
    except Exception:
        # Direct request failed: fall back to scraped proxies
        n = 10
        ip = get_ip(n)
        proxies = {'https': ip}
        while True:
            try:
                res = requests.get(
                    url, params=params, headers=CUSTOM_HEADER,
                    proxies=proxies, timeout=5
                )
                return res.text
            except KeyboardInterrupt:
                break
            except requests.RequestException:
                # The previous IP was banned; fetch a fresh one
                n += 1
                ip = get_ip(n)
                proxies = {'https': ip}
```
You can also add a timeout parameter to get_text so it can be tuned for different kinds of content.
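A minimal sketch of that variant (the default timeout value here is just an assumption; tune it per site):

```python
def get_text(url: str, params: dict, timeout: int = 5) -> str:
    # Same fallback logic as above, but the caller controls the timeout,
    # e.g. a longer one for slow pages, a shorter one for lightweight APIs
    try:
        res = requests.get(url, params=params, headers=CUSTOM_HEADER, timeout=timeout)
        return res.text
    except Exception:
        n = 10
        ip = get_ip(n)
        proxies = {'https': ip}
        while True:
            try:
                res = requests.get(
                    url, params=params, headers=CUSTOM_HEADER,
                    proxies=proxies, timeout=timeout
                )
                return res.text
            except KeyboardInterrupt:
                break
            except requests.RequestException:
                # The previous IP was banned; fetch a fresh one
                n += 1
                ip = get_ip(n)
                proxies = {'https': ip}
```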