Python image crawler
Keyword search
import requests
import os

page = input('How many pages to crawl (30 images per page): ')
queryWord = input('Keyword to crawl: ')
# page = 300
# queryWord = '车内儿童座椅'
# example keywords: 车内 内饰 车内副驾驶视角 真实车内图片 车内儿童座椅
page = int(page) + 1
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
n = 0
pn = 1
# pn is the index of the first image to fetch; Baidu Images loads 30 images per scroll by default
for m in range(1, page):
    print('page: %d' % m)
    url = 'https://image.baidu.com/search/acjson?'
    param = {
        'tn': 'resultjson_com',
        'logid': '8846269338939606587',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': queryWord,
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': queryWord,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'cg': 'girl',
        'pn': pn,    # index of the first image in this request
        'rn': '30',  # number of images per request
        'gsm': '1e',
    }
    page_text = requests.get(url=url, headers=header, params=param)
    page_text.encoding = 'utf-8'
    try:
        page_text = page_text.json()
    except Exception:
        pn += 29
        continue
    info_list = page_text['data']
    if len(info_list) == 0:
        break
    del info_list[-1]  # the last entry is an empty placeholder
    img_path_list = []
    for i in info_list:
        img_path_list.append(i['thumbURL'])
    for img_path in img_path_list:
        img_data = requests.get(url=img_path, headers=header).content
        save_path = './download/' + queryWord + '/'  # save directory
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        img_path = save_path + str(n) + '.jpg'
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        n = n + 1
    pn += 29
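Some of the thumbURL downloads can occasionally come back truncated or as non-image error pages, so it may be worth verifying the saved files afterwards. Below is a minimal post-processing sketch, not part of the original script; it assumes Pillow is installed (pip install pillow) and uses the ./download/<keyword>/ layout from above, and the helper name clean_broken_images is illustrative.

import os
from PIL import Image

def clean_broken_images(folder):
    # Walk the download folder and delete files that Pillow cannot parse as images.
    removed = 0
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        try:
            with Image.open(path) as im:
                im.verify()  # raises an exception for truncated or non-image files
        except Exception:
            os.remove(path)
            removed += 1
    return removed

# Example: clean_broken_images('./download/' + queryWord + '/')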
Enter the number of pages to crawl and the search keyword when prompted; the downloaded images are saved under ./download/<keyword>/.
Search-by-image crawler
- Create an image folder and put the query images in it, e.g. create './false_neg/' under the current directory.
- Run img2img.py.
from urllib.parse import urlparse, parse_qs
import requests
import os
import json

###############################################################
img_seed = "./false_neg/"  # replace with your own image folder
###############################################################
img_ls = os.listdir(img_seed)
for img in img_ls:
    if '.jpg' not in img:
        continue
    img_path = img_seed + img
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    # upload the seed image to Baidu's reverse-image-search endpoint
    with open(img_path, 'rb') as f:
        data = {
            'image': f
        }
        r = requests.post(
            'https://graph.baidu.com/upload?tn=pc&from=pc&image_source=PC_UPLOAD_IMAGE_MOVE&range={%22page_from%22:%20%22shituIndex%22}&extUiData%5bisLogoShow%5d=1',
            files=data, headers=headers).text
    url = json.loads(r)["data"]["url"]
    o = urlparse(url)
    q = parse_qs(o.query, True)
    sign = q['sign'][0]  # the sign parameter identifies this search session
    r1 = requests.get(url, headers=headers).text
    # fetch the list of visually similar images for this sign
    r0 = requests.get(
        "https://graph.baidu.com/ajax/pcsimi?sign={}".format(sign)).text
    l = json.loads(r0)["data"]["list"]
    img_path_list = []
    for i in l:
        img_path_list.append(i['thumbUrl'])
    n = 0
    for img_path in img_path_list:
        img_data = requests.get(url=img_path, headers=headers).content
        save_path = img_seed + img.split('.')[0] + '/'  # save directory, one folder per seed image
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        img_path = save_path + img + str(n) + '.jpg'
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        n = n + 1
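When many seed images are processed back to back, the requests above may get throttled or rejected. A minimal sketch of a polite fetch helper with retries and a delay follows; the function name, retry count, and delay values are illustrative assumptions, not part of the original script.

import time
import requests

def get_with_retry(url, headers=None, retries=3, delay=2.0, timeout=10):
    # Try the request a few times, waiting a bit longer after each failure.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))  # simple linear backoff
    return None

# Example: replace the direct requests.get calls with
#     resp = get_with_retry(img_path, headers=headers)
# and skip the image when resp is None.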