Scraping picaComic with Python

#net #python

My first python-selenium crawler. Dynamic pages really are a lot more complicated (...).
Looking back at it now, I see flaws everywhere, haha.
Keeping it here as a memento.

Source Code (using python-selenium)
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import os      # file management
import imghdr  # image integrity check
import requests
# import urllib

#--------- crawler setup ----------
chrome_options = webdriver.ChromeOptions()
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15"
# set a custom user agent
chrome_options.add_argument('--user-agent=%s' % user_agent)
# ignore certificate errors
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--ignore-ssl-errors')
chrome_options.add_argument('--ignore-certificate-errors-spki-list')
# suppress the "Bluetooth: bluetooth_adapter_winrt.cc:1075 Getting Default Adapter failed." error
# and the "DevTools listening on ws://127.0.0.1..." message; both switches must go in a
# single call, because a second add_experimental_option('excludeSwitches', ...) would
# overwrite the first
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
# don't render images in the browser itself (they are downloaded separately)
chrome_options.add_argument('blink-settings=imagesEnabled=false')

driver = webdriver.Chrome(options=chrome_options)  # launch the browser
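# (An optional tweak, not part of the original run: Chrome can also be started
# windowless with chrome_options.add_argument('--headless=new'), but the manual
# login pause near the bottom of this script needs a visible window, so it
# stays off here.)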

# opener = urllib.request.build_opener()  # urllib-based anti-scraping workaround (unused)
# opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
# urllib.request.install_opener(opener)

# Load all the links first, then download; the most stable version.
idlist = ["63c00254380963328e1d487c"]  # list of comic cids to crawl
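# (Each cid is the 24-character hex id visible in a comic's URL on the site,
# the same value passed to pcomicview/?cid=... further down.)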

def checkFileName(txt):
    # strip/replace characters that are illegal in Windows filenames
    dictionary = {" ": "", "|": " ", "/": " ", "<": "[", ">": "]", ":": "_", "\"": "'", "+": "_"}
    transTable = txt.maketrans(dictionary)
    txt = txt.translate(transTable)
    return txt
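# A quick sanity check of the mapping above (my own example, not from the
# original post): checkFileName("Title: A/B") -> "Title_A B"
# (spaces are dropped, ':' becomes '_', '/' becomes ' ').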

def pulldown():
    print("---Pulling Down The Page---")
    time.sleep(5)  # after opening a new page, wait a moment for it to load

    cnt = 0
    check_height = driver.execute_script("return document.body.scrollHeight;")
    while True:
        driver.execute_script("window.scrollBy(0,1200)")  # scrolling too fast tends to interrupt loading
        time.sleep(1.2)
        check_height1 = driver.execute_script("return document.body.scrollHeight;")
        if check_height == check_height1:
            cnt += 1
            print(cnt)
            if cnt >= 25:
                print("---Back To The Main---")
                break
        else:
            check_height = check_height1
            if cnt < 20:
                cnt = 0
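# How the loop above terminates: the site lazy-loads images, so
# document.body.scrollHeight keeps growing as content arrives. Once the height
# has stayed unchanged for 25 consecutive 1.2-second checks, the chapter is
# assumed fully loaded; if the height changes again while the counter is still
# below 20, the counter resets to zero.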

def isCompleted(img):  # image integrity check (urlretrieve was stable enough that leaving this off is fine)
    if not os.path.exists(img):
        return 0
    if not imghdr.what(img):
        print('image not completely downloaded, i\'m trying again now.')
        return 0
    else:
        return 1
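# Note: the imghdr module was deprecated in Python 3.11 and removed in 3.13.
# On newer interpreters a rough Pillow-based replacement could look like this
# (a sketch assuming `pip install pillow`, not part of the original script):
#
#     from PIL import Image
#     def isCompleted(img):
#         if not os.path.exists(img):
#             return 0
#         try:
#             with Image.open(img) as im:
#                 im.verify()  # raises on truncated or corrupt image data
#             return 1
#         except Exception:
#             print("image not completely downloaded, i'm trying again now.")
#             return 0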


def imgOutput(imgUrl, i, savePath):
    filename = savePath + f'\\{i:0>4}.jpg'
    r = requests.get(imgUrl)
    # open the file and write the image bytes
    with open(filename, 'wb') as f:
        f.write(r.content)
    # urllib.request.urlretrieve(url=imgUrl, filename=savePath + f'\\{i:0>4}.jpg')
    # if isCompleted(savePath + f'\\{i:0>4}.jpg'):
    #     return 1  # image integrity check (if you enable this block, make the line below return 0)
    return 1
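# A slightly sturdier variant of the request above (my suggestion; timeout and
# raise_for_status are standard requests features, not in the original post):
#
#     r = requests.get(imgUrl, timeout=30)  # don't hang forever on a dead link
#     r.raise_for_status()                  # turn HTTP error codes into exceptions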


def getImgs(url, savePath):

    driver.get(url)
    pulldown()  # scroll to the bottom so every image link gets loaded

    imgs = driver.find_elements(
        by=By.XPATH, value="//div[@class='chapter-images wide-block pt-2 pb-2 my-bg-white']/img")
    # the elements' src attributes refresh during the pulldown

    if not imgs:
        print("no more pages")
        return 0

    if not os.path.exists(savePath):
        os.makedirs(savePath)

    jpgn = 0
    start = time.time()
    for i in imgs:
        # if isCompleted(savePath + f'\\{jpgn:0>4}.jpg'):  # resume from a previous run
        #     jpgn += 1
        #     print(f"---{jpgn}.jpg has existed in *{savePath[-6:]}*---")
        # else:
        imgUrl = str(i.get_attribute("src"))
        if imgUrl.find(".gif") == -1:  # don't download gifs
            time.sleep(1.5)
            cnt = 0
            while cnt <= 5:
                if imgOutput(imgUrl, jpgn, savePath):
                    break
                # (with the integrity check in imgOutput disabled, imgOutput
                # always returns 1, so these retry lines never run)
                imgUrl = str(i.get_attribute("src"))
                time.sleep(1)
                print(f"Something Wrong Here. Cnt={cnt}")
                cnt += 1

            if cnt > 5:
                print(imgUrl)
                print("Sorry, I can't deal with this error, please check this url later")
            else:
                if cnt > 0:
                    print("Luckily, it has been successfully processed now. ")
                print(f"---{jpgn:0>4}.jpg has been downloaded in /{savePath[-6:]}---")

            jpgn += 1
    end = time.time()
    print(f"||This Chapt Consumed {end-start} seconds.||")

    if jpgn == 0:
        return 0
    else:
        return 1
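# Return contract: getImgs() returns 0 for a chapter page with no images,
# which is what eventually stops the per-book chapter loop below.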

driver.get('https://manhuabika.com/plogin/')
time.sleep(10)  # log in manually during this pause

driver.get(
    "https://manhuabika.com/pchapter/?cid=5fde2af559b287406e2151b8&chapter=1")
time.sleep(10)  # switch the image quality manually during this pause

bcnt = 0
for cid in idlist:

    start = time.time()

    #--------- per-book setup ---------
    print(f"\n\n|||||||Reading Book {bcnt}||||||")
    url = f"https://manhuabika.com/pchapter/?cid={cid}&chapter="
    chapN = 3  # starting chapter
    urlC = url + str(chapN)

    #--------- fetch the book title ----------
    driver.get(f"https://manhuabika.com/pcomicview/?cid={cid}")
    time.sleep(2)
    urlTitle = driver.find_element(by=By.XPATH, value="//div[@class='comic-title text-start']").text
    urlTitle = checkFileName(urlTitle)

    print(f">>Loading Book {bcnt}:[{cid}] {urlTitle}")

    savePath = "G:\\Comic\\Spider\\" + urlTitle + "\\"  # the book title doubles as the folder name

    #--------- crawl images chapter by chapter ----------
    while getImgs(urlC, savePath + f'Chap{chapN:0>2}'):
        print(f"Chap{chapN} is Finished")
        chapN += 1
        urlC = url + str(chapN)

    #--------- per-book wrap-up ----------
    end = time.time()

    print(f"This Book Consumed {end-start} seconds.")
    print(f"||||||Book {bcnt}:[{cid}] {urlTitle} has been downloaded. ||||||")

    bcnt += 1

print("\n\nおめでとうございます! All Books Have Been Processed Successfully.")