Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
65
佩文韵府/robust_download_aio.py
Normal file
65
佩文韵府/robust_download_aio.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from aiohttp_socks import ProxyConnector, ProxyType
|
||||
import os
|
||||
import urllib.parse
|
||||
import time
|
||||
|
||||
|
||||
async def download_file(session, url, filename, semaphore):
    """Download ``url`` into ``filename``, retrying a few times on failure.

    Args:
        session: An ``aiohttp.ClientSession`` (or compatible object) used to
            issue the GET request.
        url: The URL to fetch.
        filename: Destination path. If it already exists the download is
            skipped and treated as a success.
        semaphore: An ``asyncio.Semaphore`` bounding how many downloads run
            at the same time.

    Returns:
        ``True`` if the file already existed or was downloaded successfully,
        ``False`` if every attempt failed.
    """
    max_attempts = 3
    retry_delay = 2  # seconds to wait between attempts

    async with semaphore:
        if os.path.exists(filename):
            print(f"Skipping {filename}")
            return True

        # aiohttp expects a ClientTimeout object; a bare number is a
        # deprecated form of the same total timeout.
        request_timeout = aiohttp.ClientTimeout(total=30)

        # Write to a sibling temporary file first. Existing files are skipped
        # on later runs, so a partially written destination would otherwise be
        # mistaken for a finished download forever.
        partial_path = f"{filename}.part"

        for attempt in range(1, max_attempts + 1):
            try:
                async with session.get(url, timeout=request_timeout) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(partial_path, "wb") as f:
                            f.write(content)
                        # Atomic rename: the destination only ever appears
                        # with complete content.
                        os.replace(partial_path, filename)
                        print(f"Successfully downloaded {filename}")
                        return True
                    print(f"HTTP Error {response.status} for {url}")
            except Exception as e:
                # Best-effort downloader: any network or I/O error simply
                # consumes one attempt.
                print(f"Error for {url}: {e}")

            # No point in waiting after the last attempt has already failed.
            if attempt < max_attempts:
                await asyncio.sleep(retry_delay)

        print(f"Failed all retries for {url}")
        return False
|
||||
|
||||
|
||||
async def main():
    """Download every URL listed in ``missing_urls.txt`` into ``html_files/``.

    Each non-blank line of ``missing_urls.txt`` is treated as a URL. The
    percent-decoded last path segment of the URL becomes the output file
    name (``html_files/<segment>.html``). Requests go through a local SOCKS5
    proxy, with at most 10 downloads in flight at once. URLs that still fail
    after all retries are listed at the end of the run.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("html_files", exist_ok=True)

    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    # SOCKS5 proxy at 127.0.0.1:10808. aiohttp_socks takes a socks5:// URL
    # rather than socks5h://; rdns=True gives the equivalent behaviour of
    # resolving host names on the proxy side.
    proxy_url = "socks5://127.0.0.1:10808"
    connector = ProxyConnector.from_url(proxy_url, rdns=True)

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

    semaphore = asyncio.Semaphore(10)  # 10 concurrent downloads

    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        tasks = []
        for url in urls:
            decoded_url = urllib.parse.unquote(url)
            # Drop trailing slashes so the last segment is never empty;
            # otherwise every such URL would map to "html_files/.html".
            volume_name = decoded_url.rstrip("/").split("/")[-1]
            filename = f"html_files/{volume_name}.html"
            tasks.append(download_file(session, url, filename, semaphore))

        results = await asyncio.gather(*tasks)

    # Summarise what is still missing so the run does not end silently.
    failed_urls = [url for url, ok in zip(urls, results) if not ok]
    if failed_urls:
        print(f"{len(failed_urls)} of {len(urls)} downloads failed:")
        for url in failed_urls:
            print(f"  {url}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the downloader only when executed as a script, not on import.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user