话不多说,直接上代码。代码都有注释;如果有不懂的地方可以提出来,有更好的方案也欢迎提出,大家一起学习。

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

import requests

import urllib.parse

from lxml import etree

import re

import os

# 自定义错误

class Error(Exception):
    """Custom scraper error carrying a human-readable message.

    Calls Exception.__init__ so str(e) shows the message (the original
    stored only .message, leaving str(e) empty).
    """

    def __init__(self, message):
        super().__init__(message)
        self.message = message

class Wallpaper:
    """Scraper for wall.alphacoders.com wallpaper listings.

    min_resolution examples:
        0x0        -> all resolutions
        1920x1080  -> exactly 1920x1080
    """

    def __init__(self, url, path, page, min_resolution):
        self.url = url
        self.path = path
        # Query-string parameters: which listing page to fetch.
        self.params = {
            "page": page
        }
        self.min_resolution = min_resolution
        # POST form data for the listing request.
        # (The blog paste dropped the dict colons; restored here.)
        self.data = {
            # pagination mode
            "view": "paged",
            # minimum resolution filter
            "min_resolution": min_resolution,
            # "=" requires exactly this resolution; ">=" would allow larger
            "resolution_equals": "=",
            # sort order: "newest" (latest uploads) or "rating" (most liked)
            "sort": "newest",
        }

    @staticmethod
    def create_dir(path):
        """Create *path* (with parents) if missing; return a notice if it already exists."""
        if not os.path.exists(path):
            os.makedirs(path)
        else:
            return "文件夹已存在"

    def get_folder_name(self):
        """Derive the download-folder name from self.url, by link type.

        Handles tag links, category links (name= parameter), resolution
        links (w=/h= parameters) and search links (search= parameter).
        Returns None when no pattern matches.
        """
        # Tag link, e.g.
        # https://wall.alphacoders.com/tag/ahri-(league-of-legends)-wallpapers?lang=Chinese
        match = re.search(r"tag/([\w-]+)-(.*)\?lang=Chinese", self.url)
        if match:
            # Concatenate both capture groups, as the original did.
            return match.group(1) + match.group(2)
        # Other link types carry their data in the query string, e.g.
        # https://wall.alphacoders.com/by_sub_category.php?id=169908&name=...&lang=Chinese
        try:
            url_params = self.url.split("?")[1]
        except IndexError:
            # No query string at all -> nothing we can name the folder after.
            return None
        params = {}
        for param in url_params.split("&"):
            key, _, value = param.partition("=")
            params[key] = value
        name = params.get("name", None)
        if name is not None:
            # Category name: URL-decode the first "+"-separated token.
            return urllib.parse.unquote(name.split("+")[0])
        w = params.get("w")
        h = params.get("h")
        if w is not None and h is not None:
            # Resolution link, e.g. by_resolution.php?w=3840&h=2160
            return w + "x" + h
        # Search link, e.g. search.php?search=landscape&lang=Chinese
        # (The original tried w + "x" first and relied on an exception that
        # was never of the caught type; test the values explicitly instead.)
        match = re.search(r"search=([^&]+)&lang=Chinese", self.url)
        if match:
            return match.group(1)
        return None

    def get_image_urls(self):
        """Collect full-size image URLs from the current listing page.

        Returns (count, urls). Raises Error when the page has no thumbnails
        (e.g. the page number is past the end and the site redirected).
        """
        base_url = "https://wall.alphacoders.com"
        # allow_redirects=False: pages past the end redirect to the last
        # page; refusing the redirect lets us detect that and stop instead
        # of re-scraping duplicates.
        response = requests.post(self.url, params=self.params, data=self.data, allow_redirects=False)
        if response.status_code == 200:
            html = etree.HTML(response.text)
            image_page_hrefs = html.xpath(
                '//*[@id="page_container"]//div//div[@class="thumb-container"]//div[@class="boxgrid"]//@href')
            # No thumbnails on this page -> treat as an invalid page number.
            if len(image_page_hrefs) == 0:
                raise Error("获取不到当前页码的图片,请检查页码有否有效!")
            result = []
            for href in image_page_hrefs:
                # Each thumbnail links to a detail page holding the full image.
                detail_html = etree.HTML(requests.get(base_url + href).text)
                image_urls = detail_html.xpath("/html/body/div[2]/div[2]/div[2]/div[1]/img//@src")
                result.extend(image_urls)
            return len(result), result
        raise Error("获取不到当前页码的图片,请检查页码有否有效!")

    def download_image(self):
        """Download every image of the current listing page into path/<folder>/."""
        downloaded = 0
        failed = 0
        self.create_dir(self.path)
        images_dir_name = self.get_folder_name()
        images_count, images_urls = self.get_image_urls()
        for image_url in images_urls:
            # Only .png/.jpg URLs are handled; anything else counts as failed.
            name_match = re.search(r'https://[^/]+/[^/]+/(\d+)(\.png|\.jpg)', image_url)
            if name_match is None:
                failed = failed + 1
                print("下载失败 {} 张图片".format(failed))
                continue
            image_name = name_match.group(1)
            # NOTE(review): files are always saved with a .png suffix even for
            # .jpg sources — kept as-is since the duplicate check keys on it.
            target = os.path.join(self.path, images_dir_name, image_name + ".png")
            # Skip images that were already downloaded on an earlier run.
            if os.path.exists(target):
                print("已有图片: {}, 图片地址: {}".format(image_name, image_url))
                continue
            self.create_dir(os.path.join(self.path, images_dir_name))
            content = requests.get(image_url).content
            with open(target, "wb") as f:
                f.write(content)
            print("图片 {} 下载完成,图片地址: {}".format(image_name, image_url))
            downloaded = downloaded + 1
            if downloaded == images_count:
                print("当前页面图片下载完成, 一共 {} 张图片".format(downloaded))

if __name__ == '__main__':
    url = input("请输入壁纸url! \n")
    # Folder where downloaded images are stored.
    path = "images"
    # Resolution filter; 0x0 matches every resolution.
    print("""
        常用分辨率
        1920x1080
        2560x1440
        2560x1600
        3840x2160
        5120x2880
        7680x4320 
    """)
    resolution = input("请输入需要下载的分辨率! \n")
    # 200 is an upper bound on pages to try (adjust as needed); per the
    # original comment this should be range(1, 200), not range(1200).
    for page_num in range(1, 200):
        print("正在下载第 {} 页的图片".format(page_num))
        page_one = Wallpaper(url, path, str(page_num), resolution)
        try:
            # The original loop only printed page_one.data and never
            # downloaded anything; actually run the download.
            page_one.download_image()
        except Error as e:
            # Past the last page (or an invalid page): stop cleanly.
            print(e.message)
            break

Logo

加入社区!打开量化的大门,首批课程上线啦!

更多推荐