from selenium import webdriver from import By import time import os
import requests
import imghdr
# Browser configuration for the shared Selenium driver used by every function
# in this script.
chrome_options = webdriver.ChromeOptions()

# Present the browser with a desktop Safari user-agent string.
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15"
chrome_options.add_argument('--user-agent=%s' % user_agent)

# Do not abort on certificate / SSL problems.
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--ignore-ssl-errors')
chrome_options.add_argument('--ignore-certificate-errors-spki-list')

# add_experimental_option() keeps a single value per option name, so calling
# it twice with 'excludeSwitches' discards the first list. Both switches must
# therefore be passed together.
chrome_options.add_experimental_option(
    'excludeSwitches', ['enable-automation', 'enable-logging']
)

# Images are fetched separately with requests, so the browser does not need
# to render them.
chrome_options.add_argument('blink-settings=imagesEnabled=false')

driver = webdriver.Chrome(options=chrome_options)
# Characters that must not appear in a directory name on Windows, plus the
# extra substitutions (space, '+') the script has always applied.
# Built once at import time instead of on every call.
_FILENAME_TRANSLATION = str.maketrans({
    " ": "",
    "|": " ",
    "/": " ",
    "\\": " ",   # path separator: would silently create nested directories
    "<": "[",
    ">": "]",
    ":": "_",
    "\"": "'",
    "+": "_",
    "?": "_",    # reserved on Windows
    "*": "_",    # reserved on Windows
})


def checkFileName(txt: str) -> str:
    """Return ``txt`` with characters unsuitable for a folder name replaced.

    Each character is substituted independently in a single pass, so a
    replacement character is never itself substituted again (for example
    ``"|"`` becomes a space that is kept, while original spaces are dropped).

    Args:
        txt: Raw title text.

    Returns:
        The sanitised text; an empty string stays empty.
    """
    return txt.translate(_FILENAME_TRANSLATION)
def pulldown():
    """Scroll the current page downwards until its height stops changing.

    After an initial 5 second wait, the page is scrolled by 1200 pixels
    every 1.2 seconds. Each scroll that leaves ``document.body.scrollHeight``
    unchanged bumps a counter (which is printed); the function returns once
    that counter reaches 25. A height change resets the counter, but only
    while it is still below 20.
    """
    print("---Pulling Down The Page---")
    time.sleep(5)

    height_query = "return document.body.scrollHeight;"
    unchanged = 0
    known_height = driver.execute_script(height_query)

    while True:
        driver.execute_script("window.scrollBy(0,1200)")
        time.sleep(1.2)
        new_height = driver.execute_script(height_query)

        if new_height != known_height:
            # The page grew: remember the new height and, unless we are
            # already close to giving up, start counting from scratch.
            known_height = new_height
            if unchanged < 20:
                unchanged = 0
            continue

        unchanged += 1
        print(unchanged)
        if unchanged >= 25:
            print("---Back To The Main---")
            break
def isCompleted(img):
    """Tell whether ``img`` is an existing file with a recognisable image header.

    Args:
        img: Path of the file to inspect.

    Returns:
        1 if the file exists and ``imghdr.what`` identifies an image type,
        0 if the file is missing or its type is not recognised (the latter
        case also prints a notice).
    """
    # NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13.
    if not os.path.exists(img):
        return 0

    detected_type = imghdr.what(img)
    if detected_type:
        return 1

    print('image not completely downloaded, i\'m trying again now.')
    return 0
def imgOutput(imgUrl, i, savePath, timeout=30):
    """Download ``imgUrl`` and store it as a zero-padded ``NNNN.jpg`` file.

    Args:
        imgUrl: Address of the image to fetch.
        i: Image index; formatted to at least four digits for the file name.
        savePath: Existing directory the file is written into.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole run.

    Returns:
        1 when the image was fetched and written, 0 when the request failed
        (connection error, timeout, invalid URL or an HTTP error status).
        Nothing is written to disk in the failure case, which lets the
        caller retry.
    """
    filename = os.path.join(savePath, f'{i:0>4}.jpg')
    try:
        r = requests.get(imgUrl, timeout=timeout)
        # Without this an HTTP error page would be saved as if it were the image.
        r.raise_for_status()
    except requests.RequestException as err:
        print(f"Request for {imgUrl} failed: {err}")
        return 0

    with open(filename, 'wb') as f:
        f.write(r.content)
    return 1
def getImgs(url, savePath):
    """Open one chapter page and save its non-GIF images into ``savePath``.

    The page is loaded with the shared ``driver`` and scrolled with
    ``pulldown()``. Every ``<img>`` under the chapter-images container whose
    ``src`` does not contain ``.gif`` is passed to ``imgOutput``, with the
    first attempt followed by up to five retries. The ``src`` attribute is
    re-read before each retry.

    Args:
        url: Address of the chapter page.
        savePath: Directory for the downloaded files; created when missing.

    Returns:
        0 when the page has no chapter images, or none of them is a non-GIF
        image; 1 otherwise. Note that 1 is returned even if individual
        downloads ultimately failed.
    """
    driver.get(url)
    pulldown()

    imgs = driver.find_elements(
        by=By.XPATH,
        value="//div[@class='chapter-images wide-block pt-2 pb-2 my-bg-white']/img",
    )
    if not imgs:
        print("no more pages")
        return 0

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(savePath, exist_ok=True)

    jpgn = 0
    start = time.time()
    for element in imgs:
        src = element.get_attribute("src")
        if src and ".gif" in src:
            continue

        time.sleep(1.5)
        cnt = 0
        while cnt <= 5:
            # A missing/empty src counts as a failed attempt instead of being
            # handed to the downloader as the literal text "None".
            if src and imgOutput(src, jpgn, savePath):
                break
            src = element.get_attribute("src")
            time.sleep(1)
            print(f"Something Wrong Here. Cnt={cnt}")
            cnt += 1

        if cnt > 5:
            print(src)
            print("Sorry, I can't deal with this error, please check this url later")
        else:
            if cnt > 0:
                print("Luckily, it has been successfully processed now. ")
            print(f"---{jpgn:0>4}.jpg has been downloaded in /{savePath[-6:]}---")
        # The index advances even after a failure, so a failed image leaves a
        # gap in the numbering rather than shifting later files.
        jpgn += 1

    end = time.time()
    print(f"||This Chapt Consumed {end-start} seconds.||")
    return 1 if jpgn else 0
# Script entry: visit two start pages, then download every book in idlist,
# chapter by chapter, until getImgs() reports a chapter with nothing to save.
try:
    # NOTE(review): both URL literals below are empty strings in this copy of
    # the file; the real addresses need to be filled in.
    driver.get('')
    time.sleep(10)
    driver.get("")
    time.sleep(10)

    # NOTE(review): idlist is not defined anywhere in the visible source; it
    # must be provided before this loop runs.
    # enumerate() supplies the book counter, which was previously never
    # incremented and therefore always printed as 0.
    for bcnt, cid in enumerate(idlist):
        # Per-book timer; without it the elapsed-time print below raised
        # NameError because no module-level `start` existed.
        start = time.time()

        print(f"\n\n|||||||Reading Book {bcnt}||||||")
        url = f"{cid}&chapter="

        driver.get(f"{cid}")
        time.sleep(2)
        urlTitle = driver.find_element(
            by=By.XPATH, value="//div[@class='comic-title text-start']"
        ).text
        urlTitle = checkFileName(urlTitle)

        print(f">>Loading Book {bcnt}:[{cid}] {urlTitle}")

        savePath = "G:\\Comic\\Spider\\" + urlTitle + "\\"

        # Chapters are fetched starting from number 3 and counting upwards.
        chapN = 3
        while getImgs(url + str(chapN), savePath + f'Chap{chapN:0>2}'):
            print(f"Chap{chapN} is Finished")
            chapN += 1

        end = time.time()
        print(f"This Book Consumed {end-start} seconds.")
        print(f"||||||Book {bcnt}:[{cid}] {urlTitle} have been downloaded. ||||||")

    print("\n\nおめでとうございます! All Books Have Been Processed Successfully.")
finally:
    # Always shut the browser down, even when a download step raises.
    driver.quit()