import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector, ProxyType
import os
import urllib.parse
import time


async def download_file(session, url, filename, semaphore):
    """Download *url* to *filename* through *session*, retrying up to 3 times.

    Concurrency is bounded by *semaphore*. Returns True on success (or when
    *filename* already exists on disk), False once all retries are exhausted.
    """
    async with semaphore:
        # Skip work already done by a previous run.
        if os.path.exists(filename):
            print(f"Skipping existing file: {filename}")
            return True
        retries = 3
        while retries > 0:
            try:
                # aiohttp deprecates bare-int timeouts; use ClientTimeout.
                timeout = aiohttp.ClientTimeout(total=30)
                async with session.get(url, timeout=timeout) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(filename, "wb") as f:
                            f.write(content)
                        print(f"Successfully downloaded: {filename}")
                        return True
                    else:
                        print(f"HTTP Error {response.status} for {url}")
            except Exception as e:
                # Network/timeout errors are retried like HTTP failures.
                print(f"Error for {url}: {e}")
            retries -= 1
            # Brief back-off before the next attempt.
            await asyncio.sleep(2)
        print(f"Failed all retries for {url}")
        return False


async def main():
    """Read URLs from missing_urls.txt and fetch them all through a SOCKS5 proxy."""
    # exist_ok=True is idempotent and avoids the exists()/makedirs() race.
    os.makedirs("html_files", exist_ok=True)
    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    # SOCKS5 proxy at 127.0.0.1:10808.
    # aiohttp_socks uses socks5:// instead of socks5h://, but with rdns=True
    # DNS resolution happens through the proxy, which is equivalent.
    proxy_url = "socks5://127.0.0.1:10808"
    connector = ProxyConnector.from_url(proxy_url, rdns=True)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    semaphore = asyncio.Semaphore(10)  # at most 10 concurrent downloads

    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        tasks = []
        for url in urls:
            # Percent-decode only to derive a readable file name; the original
            # (encoded) URL is what gets requested.
            decoded_url = urllib.parse.unquote(url)
            volume_name = decoded_url.split("/")[-1]
            filename = f"html_files/{volume_name}.html"
            tasks.append(download_file(session, url, filename, semaphore))
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())