from urllib import request
from io import BytesIO
import gzip, re
class Spider():
    """Scrape a Douyu category page and print its streamers ranked by popularity.

    The page is downloaded, streamer names and popularity strings are pulled
    out with regular expressions, the entries are sorted by popularity in
    descending order and printed one per line.
    """

    # Douyu category page for the QQ Speed mobile game.
    url = 'https://www.douyu.com/g_qqfcsy'
    # Raw strings keep ``\s`` / ``\S`` as regex escapes instead of (invalid)
    # string escapes; the resulting pattern text is unchanged.
    name_pattern = r'<h2 class="DyListCover-user is-template"><svg><use xlink:href="#icon-user_05fb112"></use></svg>([\s\S]*?)</h2>'
    hot_pattern = r'<span class="DyListCover-hot is-template"><svg><use xlink:href="#icon-hot_635f5ef"></use></svg>([\s\S]*?)</span>'
    # Seconds to wait on the network before giving up instead of hanging.
    timeout = 10

    @staticmethod
    def _decode_body(raw):
        """Return the UTF-8 text of a raw response body.

        The body is gunzipped first when it starts with the gzip magic
        number; otherwise it is decoded as-is, so an uncompressed response
        no longer makes the fetch fail.
        """
        if raw[:2] == b'\x1f\x8b':
            raw = gzip.decompress(raw)
        return raw.decode('utf-8')

    def __fetch_content(self):
        """Download the category page and return its HTML as a string."""
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        }
        req = request.Request(url=Spider.url, headers=headers)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req, timeout=Spider.timeout) as response:
            raw = response.read()
        return self._decode_body(raw)

    def __analysis(self, htmls):
        """Extract streamers from the page HTML.

        Returns a list of ``{'name': ..., 'number': ...}`` dicts in page
        order, where ``number`` is the popularity text exactly as matched.
        """
        names = re.findall(Spider.name_pattern, htmls)
        hots = re.findall(Spider.hot_pattern, htmls)
        # zip() stops at the shorter list, so a page where fewer popularity
        # values than names are matched cannot raise IndexError.
        return [{'name': name, 'number': hot} for name, hot in zip(names, hots)]

    def __sort(self, anchors):
        """Return a new list of anchors ordered from most to least popular."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert an anchor's popularity text into a float sort key.

        The first decimal number in the text is used; when the text contains
        '万' the value is multiplied by 10000. Text without any digits maps
        to 0.0 so that such entries sort last instead of raising.
        """
        found = re.search(r'\d+\.?\d*', anchor['number'])
        if found is None:
            return 0.0
        number = float(found.group())
        if '万' in anchor['number']:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``rank N : name number`` line per anchor, ranks from 1."""
        for rank, anchor in enumerate(anchors, start=1):
            print(f"rank {str(rank).ljust(3)} : {anchor['name'].ljust(20)} {anchor['number'].rjust(10)}")

    def run(self):
        """Fetch, parse, sort and print the ranking."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__sort(anchors)
        self.__show(anchors)
# Only crawl when executed as a script, so that importing this module does
# not trigger a network request and printing as a side effect.
if __name__ == "__main__":
    spider = Spider()
    spider.run()
"""
rank 1 : DY丶皮皮炸 16.5万
rank 2 : 81丶叶大神 11.1万
rank 3 : 黑鲨神竞等风 9.9万
rank 4 : 丨北晨丨 8.8万
rank 5 : BN丶夏璐8 7.4万
rank 6 : 小恒很帅 6.3万
rank 7 : 无敌帆丶 5.9万
rank 8 : 花海Ayore 5.7万
rank 9 : wD丶计时坨坨 5万
rank 10 : 午夜锦年 4.9万
rank 11 : 哈杰J 4万
...
"""