"""Scrape sample chapters of 五车韵瑞 from ctext.org with Playwright and save parsed entries to JSON."""
import asyncio
import json
import re
from urllib.parse import urljoin

from playwright.async_api import async_playwright
# Index page listing every chapter link of the text (gb = simplified-Chinese interface).
CTEXT_INDEX_URL = "https://ctext.org/wiki.pl?if=gb&res=87723&remap=gb"
async def scrape_ctext():
|
||
results = {}
|
||
|
||
# 启动Playwright(必须使用非无头模式以绕过反爬)
|
||
async with async_playwright() as p:
|
||
browser = await p.chromium.launch(headless=False)
|
||
context = await browser.new_context(
|
||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||
viewport={"width": 1280, "height": 800}
|
||
)
|
||
page = await context.new_page()
|
||
|
||
print(f"正在访问目录页: {CTEXT_INDEX_URL}")
|
||
await page.goto(CTEXT_INDEX_URL, wait_until="domcontentloaded")
|
||
|
||
# 停顿等待Cloudflare盾
|
||
await page.wait_for_timeout(5000)
|
||
|
||
# 提取所有章节链接
|
||
links = await page.locator("a[href*='wiki.pl?if=gb&chapter=']").all()
|
||
chapter_urls = []
|
||
for link in links:
|
||
href = await link.get_attribute("href")
|
||
text = await link.inner_text()
|
||
if href:
|
||
full_url = "https://ctext.org/" + href
|
||
if full_url not in chapter_urls:
|
||
chapter_urls.append(full_url)
|
||
|
||
print(f"找到 {len(chapter_urls)} 个章节链接。")
|
||
|
||
# 遍历前几个章节作为示例
|
||
for url in chapter_urls[:2]:
|
||
print(f"正在抓取章节: {url}")
|
||
await page.goto(url, wait_until="domcontentloaded")
|
||
await page.wait_for_timeout(3000)
|
||
|
||
# 提取文本区
|
||
text_content = await page.evaluate("""() => {
|
||
const tds = document.querySelectorAll('td.ctext');
|
||
let text = '';
|
||
tds.forEach(td => { text += td.innerText + '\\n'; });
|
||
return text;
|
||
}""")
|
||
|
||
print("--- 抓取到的原始文本前100字 ---")
|
||
print(text_content[:100])
|
||
print("--------------------------------")
|
||
|
||
# 解析逻辑(基于假设的文本结构)
|
||
current_volume = "未知卷"
|
||
current_rhyme = "未知韵"
|
||
current_tone = "平声"
|
||
|
||
lines = text_content.split('\n')
|
||
for line in lines:
|
||
line = line.strip()
|
||
if not line: continue
|
||
|
||
# 尝试识别卷和韵(注:CText上的维基文本实际上是未经校对的乱码OCR,因此这里的正则很难完美匹配)
|
||
if line.startswith("卷"):
|
||
current_volume = line
|
||
continue
|
||
if "声" in line and len(line) < 5:
|
||
current_tone = line
|
||
continue
|
||
|
||
# 尝试分离词条和内容(假设词条在行首且长度<4)
|
||
parts = re.split(r'[::\s]', line, maxsplit=1)
|
||
if len(parts) == 2:
|
||
word = parts[0]
|
||
content = parts[1]
|
||
else:
|
||
word = line[0:2] if len(line) > 2 else line
|
||
content = line[2:]
|
||
|
||
# 按用户要求格式构建字典
|
||
if word not in results:
|
||
results[word] = []
|
||
|
||
results[word].append({
|
||
"卷": current_volume,
|
||
"大韵": current_rhyme,
|
||
"声调": current_tone,
|
||
"词条内容": content
|
||
})
|
||
|
||
await browser.close()
|
||
|
||
# 保存为JSON
|
||
with open("五车韵瑞_示例.json", "w", encoding="utf-8") as f:
|
||
json.dump(results, f, ensure_ascii=False, indent=2)
|
||
|
||
print("提取完成。由于源数据本身存在大量OCR乱码,JSON数据可能需要大量人工清洗。")
|
||
|
||
if __name__ == "__main__":
    # Script entry point: run the async scraper to completion.
    asyncio.run(scrape_ctext())