# 注意:运行这个脚本前需要先安装第三方库 requests
# 安装方法:在命令行执行 pip install requests
#一些库的用处:
# requests 用于下载网页源码 第三方库 通过pip安装
# re 正则表达式要用到
# threading 启用多线程
# os,sys 用于创建目录
# time 用于获得当天时间
import requests
import re
import threading
import os
import sys
import time
class Final:
    """Constants used by the pixiv spider.

    ``Headers`` is the header dict sent with every image request. Only the
    ``referer`` header is set; all other headers are left to the HTTP
    library's defaults.
    """

    # NOTE: browser-specific headers (user-agent, cookie, ...) are
    # intentionally not kept here, not even commented out -- a logged-in
    # session cookie is a credential and must never be stored in source.
    Headers = {
        'referer': 'https://i.pximg.net',
    }
def DownloadSourcepng(image_url, imge_ref, imgid):
    """Download the PNG variant of an image into the output folder.

    The file is written to ``wz + imgid + ".png"``, where ``wz`` is the
    module-level output directory (it must end with a path separator).

    Args:
        image_url: URL of the image. A trailing ``.jpg`` extension is
            replaced with ``.png`` before the request is made.
        imge_ref: Header dict passed to ``requests.get`` as ``headers``.
        imgid: Image id, used as the file name without extension.

    Returns:
        True if the server answered with HTTP 200 and the file was written,
        False if the request failed or returned any other status code.
    """
    # Swap only the file extension; a blanket replace('jpg', 'png') would
    # also rewrite any other "jpg" that happens to appear in the URL.
    if image_url.endswith('.jpg'):
        image_url = image_url[:-len('.jpg')] + '.png'

    try:
        # Without a timeout a stalled connection blocks this thread forever.
        res = requests.get(image_url, headers=imge_ref, timeout=30)
    except requests.RequestException as err:
        print(image_url)
        print(err)
        print('download failed...')
        return False

    if res.status_code != 200:
        # Anything but 200 means the image could not be fetched.
        print(image_url)
        print(res.status_code)
        print('download failed...')
        return False

    print(image_url)
    print('OK')
    # The with-statement closes the file; no explicit close() is needed.
    with open(wz + imgid + ".png", 'wb') as f:
        f.write(res.content)
    return True
def DownloadSourcejpg(image_url, imge_ref, imgid):
    """Download an image as JPG, falling back to PNG when that fails.

    The original-size URL is derived from a thumbnail URL and always ends in
    ``.jpg``, but the original may really be a PNG. When the JPG request
    does not return HTTP 200, the download is retried through
    ``DownloadSourcepng``.

    The file is written to ``wz + imgid + ".jpg"``, where ``wz`` is the
    module-level output directory (it must end with a path separator).

    Args:
        image_url: URL of the image, ending in ``.jpg``.
        imge_ref: Header dict passed to ``requests.get`` as ``headers``.
        imgid: Image id, used as the file name without extension.

    Returns:
        True if the image was saved (as JPG or, via the fallback, as PNG),
        False otherwise.
    """
    try:
        # Without a timeout a stalled connection blocks this thread forever.
        res = requests.get(image_url, headers=imge_ref, timeout=30)
    except requests.RequestException as err:
        print(image_url)
        print(err)
        print('download failed...')
        return False

    if res.status_code != 200:
        # Probably a PNG original: retry with the other extension and
        # propagate that result instead of discarding it.
        return DownloadSourcepng(image_url, imge_ref, imgid)

    print(image_url)
    print('OK')
    # The with-statement closes the file; no explicit close() is needed.
    with open(wz + imgid + ".jpg", 'wb') as f:
        f.write(res.content)
    return True
# --- Script body: fetch the daily ranking page and download every image ---

# Fetch the HTML of the daily ranking page.
r = requests.get(url='https://www.pixiv.net/ranking.php?mode=daily')

# Current working directory, with Windows backslashes normalised to "/".
# (The backslash has to be escaped inside the string literal.)
CurrentPath = os.getcwd()
CurrentPath = CurrentPath.replace('\\', '/')

# Today's date (local time), used as the name of the output folder.
rq = time.strftime('%Y-%m-%d')

# Output directory; the download functions read this module-level name and
# expect it to end with "/".
wz = CurrentPath + "/" + rq + "/"

# exist_ok=True so that running the script twice on the same day does not
# crash with FileExistsError.
os.makedirs(wz, exist_ok=True)

html = r.text

# Thumbnail <img> sources on the ranking page. The dots are escaped so they
# only match literal dots.
IMG = r'src="https://i\.pximg\.net/c/[0-9]*x[0-9]*/img-master/img/[0-9]*/[0-9]*/[0-9]*/[0-9]*/[0-9]*/[0-9]*/[0-9]*_p0_master1200\.jpg"'
ls = re.findall(IMG, html)

for i in ls:
    # Strip the surrounding src="..." to keep only the URL.
    imgxurl = i.split('"')[1]

    # Turn the thumbnail URL into the original-size URL. The size segment is
    # matched with a pattern because the regex above accepts any WxH, not
    # only 240x480. The result always ends in ".jpg"; the real file may be a
    # PNG, which DownloadSourcejpg handles by falling back.
    imgyurl = re.sub(r'/c/[0-9]+x[0-9]+/img-master', '/img-original', imgxurl)
    imgyurl = imgyurl.replace('_master1200', '')

    # The image id is the part of the file name before "_p0"; taking it from
    # the name works for ids of any length, unlike fixed slice offsets.
    imgid = imgyurl.rsplit('/', 1)[-1].split('_', 1)[0]

    # One thread per image. The threads are non-daemon, so the interpreter
    # waits for all downloads to finish before exiting.
    xiancheng = threading.Thread(target=DownloadSourcejpg, args=(imgyurl, Final.Headers, imgid))
    xiancheng.start()
# 功能说明:
# - 可以抓取 pixiv 每日排行榜上的 50 张图
# - 使用多线程同时下载
# 运行效果:图片保存在当前目录下以当天日期命名的文件夹中
# 注意:由于网络限制,中国大陆地区需要通过代理才能访问 pixiv