Skip to content

urllib练习

获取当前斗鱼QQ飞车手游主播排行榜

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from urllib import request
from io import BytesIO
import gzip, re

class Spider():
    """Scrape a Douyu category page and print its streamers ranked by popularity.

    The pipeline (see ``run``) is: download the page, extract streamer names
    and popularity labels with two regular expressions, sort by the numeric
    value of the popularity label (descending) and print one line per
    streamer.

    Class attributes:
        url: Page to download.
        name_pattern: Regex whose single group captures a streamer name.
        hot_pattern: Regex whose single group captures a popularity label.
        timeout: Seconds to wait for the HTTP response before giving up.
    """

    url = 'https://www.douyu.com/g_qqfcsy'  # Douyu "QQ Speed Mobile" category page
    # Raw strings: "\s" / "\S" inside a normal string literal are invalid
    # escape sequences and trigger a warning on modern Python.
    name_pattern = r'<h2 class="DyListCover-user is-template"><svg><use xlink:href="#icon-user_05fb112"></use></svg>([\s\S]*?)</h2>'
    hot_pattern = r'<span class="DyListCover-hot is-template"><svg><use xlink:href="#icon-hot_635f5ef"></use></svg>([\s\S]*?)</span>'
    timeout = 10

    @staticmethod
    def __decode_body(raw):
        """Return *raw* response bytes as text, gunzipping first if needed.

        The body is only decompressed when it starts with the gzip magic
        number, so plain (uncompressed) responses are handled as well.

        Raises:
            UnicodeDecodeError: if the (decompressed) body is not valid UTF-8.
        """
        if raw[:2] == b'\x1f\x8b':  # gzip magic number
            raw = gzip.decompress(raw)
        return raw.decode('utf-8')

    def __fetch_content(self):
        """Download ``url`` and return the page as a decoded string.

        Raises:
            urllib.error.URLError: on network failure or timeout.
        """
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        }
        req = request.Request(url=self.url, headers=headers)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req, timeout=self.timeout) as resp:
            raw = resp.read()
        return self.__decode_body(raw)

    def __analysis(self, htmls):
        """Extract streamers from the page text.

        Args:
            htmls: Full page source as a string.

        Returns:
            A list of ``{'name': ..., 'number': ...}`` dicts in page order.
            Names and popularity labels are paired positionally; if the two
            patterns yield a different number of matches, the surplus
            entries are dropped instead of raising ``IndexError``.
        """
        names = re.findall(self.name_pattern, htmls)
        hots = re.findall(self.hot_pattern, htmls)
        return [{'name': name, 'number': hot} for name, hot in zip(names, hots)]

    def __sort(self, anchors):
        """Return a new list of *anchors* ordered from most to least popular."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Return the numeric popularity of *anchor* for use as a sort key.

        The first decimal number found in ``anchor['number']`` is used; when
        the label contains '万' the value is multiplied by 10000. A label
        without any digits yields ``0.0`` so such entries sort last rather
        than aborting the whole sort.
        """
        label = anchor['number']
        match = re.search(r'\d+\.?\d*', label)
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in label:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``rank N : name number`` line per anchor, starting at rank 1."""
        for rank, anchor in enumerate(anchors, start=1):
            print(f"rank {str(rank).ljust(3)} : {anchor['name'].ljust(20)} {anchor['number'].rjust(10)}")

    def run(self):
        """Fetch the page, parse it, sort the streamers and print the ranking."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__sort(anchors)
        self.__show(anchors)

# Script entry point: create a Spider and run the whole fetch -> parse ->
# sort -> print pipeline. There is no `if __name__ == "__main__"` guard, so
# this also executes (and performs the network request) when the module is
# imported. The triple-quoted string after these statements is a no-op
# expression that appears to hold sample output.
spider = Spider()
spider.run()
"""
rank 1   : DY丶皮皮炸                    16.5万
rank 2   : 81丶叶大神                    11.1万
rank 3   : 黑鲨神竞等风                     9.9万
rank 4   : 丨北晨丨                       8.8万
rank 5   : BN丶夏璐8                     7.4万
rank 6   : 小恒很帅                       6.3万
rank 7   : 无敌帆丶                       5.9万
rank 8   : 花海Ayore                    5.7万
rank 9   : wD丶计时坨坨                      5万
rank 10  : 午夜锦年                       4.9万
rank 11  : 哈杰J                          4万
...
"""