Files
spider-ctext/佩文韵府/robust_download_aio.py
2026-03-22 16:18:35 +08:00

66 lines
2.1 KiB
Python

import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector, ProxyType
import os
import urllib.parse
import time
async def download_file(session, url, filename, semaphore):
    """Download *url* to *filename* with up to 3 retries.

    Concurrency is bounded by *semaphore*. Returns True on success or if
    the file already exists on disk; False once all retries are exhausted.
    """
    async with semaphore:
        # Resume support: skip anything fetched by a previous run.
        if os.path.exists(filename):
            print(f"Skipping {filename}")
            return True
        retries = 3
        while retries > 0:
            try:
                # Bare int timeouts are deprecated in aiohttp; use ClientTimeout.
                timeout = aiohttp.ClientTimeout(total=30)
                async with session.get(url, timeout=timeout) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(filename, "wb") as f:
                            f.write(content)
                        print(f"Successfully downloaded {filename}")
                        return True
                    else:
                        print(f"HTTP Error {response.status} for {url}")
            except Exception as e:
                # Broad catch is deliberate: any transient network/proxy
                # failure should fall through to the retry path below.
                print(f"Error for {url}: {e}")
            # Non-200 responses and exceptions both consume a retry.
            retries -= 1
            await asyncio.sleep(2)
        print(f"Failed all retries for {url}")
        return False
async def main():
    """Read missing_urls.txt and fetch every listed URL into html_files/."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")

    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    # All traffic goes through a local SOCKS5 proxy at 127.0.0.1:10808.
    # aiohttp_socks only accepts socks5:// (not socks5h://), but rdns=True
    # gives the same remote-DNS-resolution behavior.
    connector = ProxyConnector.from_url("socks5://127.0.0.1:10808", rdns=True)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    semaphore = asyncio.Semaphore(10)  # cap in-flight downloads at 10

    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        tasks = []
        for url in urls:
            # Output file is named after the percent-decoded last path segment.
            volume_name = urllib.parse.unquote(url).split("/")[-1]
            target = f"html_files/{volume_name}.html"
            tasks.append(download_file(session, url, target, semaphore))
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())