Scraping picaComic with Python

#net #python

My first python-selenium crawler. Dynamic pages really are a lot more complicated (...).
Looking back at it now, I see flaws everywhere, haha.
Keeping it here as a memento.

Source Code (using python-selenium)
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import os      # file management
import imghdr  # image integrity check
import requests
# import urllib

#--------- crawler setup ----------
chrome_options = webdriver.ChromeOptions()
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15"
# set a custom user agent
chrome_options.add_argument('--user-agent=%s' % user_agent)
# ignore certificate errors
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--ignore-ssl-errors')
chrome_options.add_argument('--ignore-certificate-errors-spki-list')
# suppress the "Bluetooth: bluetooth_adapter_winrt.cc:1075 Getting Default Adapter failed." error
# and the "DevTools listening on ws://127.0.0.1..." message; both switches must go in a
# single call, because a second add_experimental_option('excludeSwitches', ...) would
# overwrite the first
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
# don't render images in the browser itself (they are downloaded separately)
chrome_options.add_argument('blink-settings=imagesEnabled=false')

driver = webdriver.Chrome(options=chrome_options)  # launch the browser
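# (An optional tweak, not part of the original run: Chrome can also be started
# windowless with chrome_options.add_argument('--headless=new'), but the manual
# login pause near the bottom of this script needs a visible window, so it
# stays off here.)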

# opener = urllib.request.build_opener()  # urllib-based anti-scraping workaround (unused)
# opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
# urllib.request.install_opener(opener)

# Load all the links first, then download; the most stable version.
idlist = ["63c00254380963328e1d487c"]  # list of comic cids to crawl
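# (Each cid is the 24-character hex id visible in a comic's URL on the site,
# the same value passed to pcomicview/?cid=... further down.)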

def checkFileName(txt):
    # strip/replace characters that are illegal in Windows filenames
    dictionary = {" ": "", "|": " ", "/": " ", "<": "[", ">": "]", ":": "_", "\"": "'", "+": "_"}
    transTable = txt.maketrans(dictionary)
    txt = txt.translate(transTable)
    return txt
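# A quick sanity check of the mapping above (my own example, not from the
# original post): checkFileName("Title: A/B") -> "Title_A B"
# (spaces are dropped, ':' becomes '_', '/' becomes ' ').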

def pulldown():
    print("---Pulling Down The Page---")
    time.sleep(5)  # after opening a new page, wait a moment for it to load

    cnt = 0
    check_height = driver.execute_script("return document.body.scrollHeight;")
    while True:
        driver.execute_script("window.scrollBy(0,1200)")  # scrolling too fast tends to interrupt loading
        time.sleep(1.2)
        check_height1 = driver.execute_script("return document.body.scrollHeight;")
        if check_height == check_height1:
            cnt += 1
            print(cnt)
            if cnt >= 25:
                print("---Back To The Main---")
                break
        else:
            check_height = check_height1
            if cnt < 20:
                cnt = 0
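# How the loop above terminates: the site lazy-loads images, so
# document.body.scrollHeight keeps growing as content arrives. Once the height
# has stayed unchanged for 25 consecutive 1.2-second checks, the chapter is
# assumed fully loaded; if the height changes again while the counter is still
# below 20, the counter resets to zero.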

def isCompleted(img):  # image integrity check (urlretrieve was stable enough that leaving this off is fine)
    if not os.path.exists(img):
        return 0
    if not imghdr.what(img):
        print('image not completely downloaded, i\'m trying again now.')
        return 0
    else:
        return 1
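# Note: the imghdr module was deprecated in Python 3.11 and removed in 3.13.
# On newer interpreters a rough Pillow-based replacement could look like this
# (a sketch assuming `pip install pillow`, not part of the original script):
#
#     from PIL import Image
#     def isCompleted(img):
#         if not os.path.exists(img):
#             return 0
#         try:
#             with Image.open(img) as im:
#                 im.verify()  # raises on truncated or corrupt image data
#             return 1
#         except Exception:
#             print("image not completely downloaded, i'm trying again now.")
#             return 0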


def imgOutput(imgUrl, i, savePath):
    filename = savePath + f'\\{i:0>4}.jpg'
    r = requests.get(imgUrl)
    # open the file and write the image bytes
    with open(filename, 'wb') as f:
        f.write(r.content)
    # urllib.request.urlretrieve(url=imgUrl, filename=savePath + f'\\{i:0>4}.jpg')
    # if isCompleted(savePath + f'\\{i:0>4}.jpg'):
    #     return 1  # image integrity check (if you enable this block, make the line below return 0)
    return 1
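# A slightly sturdier variant of the request above (my suggestion; timeout and
# raise_for_status are standard requests features, not in the original post):
#
#     r = requests.get(imgUrl, timeout=30)  # don't hang forever on a dead link
#     r.raise_for_status()                  # turn HTTP error codes into exceptions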


def getImgs(url, savePath):

    driver.get(url)
    pulldown()  # scroll to the bottom so every image link gets loaded

    imgs = driver.find_elements(
        by=By.XPATH, value="//div[@class='chapter-images wide-block pt-2 pb-2 my-bg-white']/img")
    # the elements' src attributes refresh during the pulldown

    if not imgs:
        print("no more pages")
        return 0

    if not os.path.exists(savePath):
        os.makedirs(savePath)

    jpgn = 0
    start = time.time()
    for i in imgs:
        # if isCompleted(savePath + f'\\{jpgn:0>4}.jpg'):  # resume from a previous run
        #     jpgn += 1
        #     print(f"---{jpgn}.jpg has existed in *{savePath[-6:]}*---")
        # else:
        imgUrl = str(i.get_attribute("src"))
        if imgUrl.find(".gif") == -1:  # don't download gifs
            time.sleep(1.5)
            cnt = 0
            while cnt <= 5:
                if imgOutput(imgUrl, jpgn, savePath):
                    break
                # (with the integrity check in imgOutput disabled, imgOutput
                # always returns 1, so these retry lines never run)
                imgUrl = str(i.get_attribute("src"))
                time.sleep(1)
                print(f"Something Wrong Here. Cnt={cnt}")
                cnt += 1

            if cnt > 5:
                print(imgUrl)
                print("Sorry, I can't deal with this error, please check this url later")
            else:
                if cnt > 0:
                    print("Luckily, it has been successfully processed now. ")
                print(f"---{jpgn:0>4}.jpg has been downloaded in /{savePath[-6:]}---")

            jpgn += 1
    end = time.time()
    print(f"||This Chapt Consumed {end-start} seconds.||")

    if jpgn == 0:
        return 0
    else:
        return 1
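# Return contract: getImgs() returns 0 for a chapter page with no images,
# which is what eventually stops the per-book chapter loop below.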

driver.get('https://manhuabika.com/plogin/')
time.sleep(10)  # log in manually during this pause

driver.get(
    "https://manhuabika.com/pchapter/?cid=5fde2af559b287406e2151b8&chapter=1")
time.sleep(10)  # switch the image quality manually during this pause

bcnt = 0
for cid in idlist:

    start = time.time()

    #--------- per-book setup ---------
    print(f"\n\n|||||||Reading Book {bcnt}||||||")
    url = f"https://manhuabika.com/pchapter/?cid={cid}&chapter="
    chapN = 3  # starting chapter
    urlC = url + str(chapN)

    #--------- fetch the book title ----------
    driver.get(f"https://manhuabika.com/pcomicview/?cid={cid}")
    time.sleep(2)
    urlTitle = driver.find_element(by=By.XPATH, value="//div[@class='comic-title text-start']").text
    urlTitle = checkFileName(urlTitle)

    print(f">>Loading Book {bcnt}:[{cid}] {urlTitle}")

    savePath = "G:\\Comic\\Spider\\" + urlTitle + "\\"  # the book title doubles as the folder name

    #--------- crawl images chapter by chapter ----------
    while getImgs(urlC, savePath + f'Chap{chapN:0>2}'):
        print(f"Chap{chapN} is Finished")
        chapN += 1
        urlC = url + str(chapN)

    #--------- per-book wrap-up ----------
    end = time.time()

    print(f"This Book Consumed {end-start} seconds.")
    print(f"||||||Book {bcnt}:[{cid}] {urlTitle} has been downloaded. ||||||")

    bcnt += 1

print("\n\nおめでとうございます! All Books Have Been Processed Successfully.")