Files
spider-ctext/佩文韵府/robust_download_aio.py
2026-03-22 16:18:35 +08:00

66 lines
2.1 KiB
Python

import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector, ProxyType
import os
import urllib.parse
import time
async def download_file(session, url, filename, semaphore):
    """Download *url* to *filename* with up to 3 retries.

    Concurrency is bounded by *semaphore*. Returns True on success or if
    the file already exists on disk; False once all retries are exhausted.
    """
    async with semaphore:
        # Resume support: skip anything fetched by a previous run.
        if os.path.exists(filename):
            print(f"Skipping {filename}")
            return True
        retries = 3
        while retries > 0:
            try:
                # Bare int timeouts are deprecated in aiohttp; use ClientTimeout.
                timeout = aiohttp.ClientTimeout(total=30)
                async with session.get(url, timeout=timeout) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(filename, "wb") as f:
                            f.write(content)
                        print(f"Successfully downloaded {filename}")
                        return True
                    else:
                        print(f"HTTP Error {response.status} for {url}")
            except Exception as e:
                # Broad catch is deliberate: any transient network/proxy
                # failure should fall through to the retry path below.
                print(f"Error for {url}: {e}")
            # Non-200 responses and exceptions both consume a retry.
            retries -= 1
            await asyncio.sleep(2)
        print(f"Failed all retries for {url}")
        return False
async def main():
    """Read missing_urls.txt and fetch every listed URL into html_files/."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")

    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    # All traffic goes through a local SOCKS5 proxy at 127.0.0.1:10808.
    # aiohttp_socks only accepts socks5:// (not socks5h://), but rdns=True
    # gives the same remote-DNS-resolution behavior.
    connector = ProxyConnector.from_url("socks5://127.0.0.1:10808", rdns=True)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    semaphore = asyncio.Semaphore(10)  # cap in-flight downloads at 10

    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        tasks = []
        for url in urls:
            # Output file is named after the percent-decoded last path segment.
            volume_name = urllib.parse.unquote(url).split("/")[-1]
            target = f"html_files/{volume_name}.html"
            tasks.append(download_file(session, url, target, semaphore))
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())