Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
104
五车韵瑞/scrape_wuche.py
Normal file
104
五车韵瑞/scrape_wuche.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
CTEXT_INDEX_URL = "https://ctext.org/wiki.pl?if=gb&res=87723&remap=gb"
|
||||
|
||||
def _parse_chapter_text(text_content):
    """Parse one chapter's raw text into ``{word: [entry, ...]}``.

    Section state (volume / rhyme / tone) is tracked line by line and
    attached to every entry produced after the marker line that set it.

    NOTE(review): the CText wiki text is largely uncorrected OCR, so these
    heuristics (line starts with "卷"; short line containing "声") are
    best-effort and the output will need manual cleaning.
    """
    parsed = {}
    current_volume = "未知卷"
    # Never updated by the current heuristics — TODO: add a rhyme-header rule.
    current_rhyme = "未知韵"
    current_tone = "平声"

    for raw_line in text_content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Marker lines update parser state instead of producing entries.
        if line.startswith("卷"):
            current_volume = line
            continue
        if "声" in line and len(line) < 5:
            current_tone = line
            continue

        # Split "headword <sep> body"; fall back to a fixed 2-char headword.
        # The parts[0] guard prevents an empty-string key when the line
        # begins with a separator character such as ":".
        parts = re.split(r'[::\s]', line, maxsplit=1)
        if len(parts) == 2 and parts[0]:
            word, content = parts
        else:
            word = line[0:2] if len(line) > 2 else line
            content = line[2:]

        parsed.setdefault(word, []).append({
            "卷": current_volume,
            "大韵": current_rhyme,
            "声调": current_tone,
            "词条内容": content,
        })
    return parsed


async def scrape_ctext():
    """Scrape sample chapters of 五车韵瑞 from ctext.org and dump them to JSON.

    Side effects: launches a *visible* Chromium window (headless mode tends
    to trip the site's anti-bot protection) and writes ``五车韵瑞_示例.json``
    to the current directory.  Returns ``None``.
    """
    results = {}

    # Non-headless launch is deliberate: it helps get past the Cloudflare shield.
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800}
        )
        page = await context.new_page()

        print(f"正在访问目录页: {CTEXT_INDEX_URL}")
        await page.goto(CTEXT_INDEX_URL, wait_until="domcontentloaded")

        # Fixed pause so any Cloudflare challenge can resolve.
        await page.wait_for_timeout(5000)

        # Collect unique chapter links; the `seen` set makes de-duplication
        # O(1) per link instead of the original O(n) list-membership test.
        links = await page.locator("a[href*='wiki.pl?if=gb&chapter=']").all()
        chapter_urls = []
        seen = set()
        for link in links:
            href = await link.get_attribute("href")
            if not href:
                continue
            # lstrip("/") avoids "https://ctext.org//..." when the href is
            # an absolute path rather than a bare relative one.
            full_url = "https://ctext.org/" + href.lstrip("/")
            if full_url not in seen:
                seen.add(full_url)
                chapter_urls.append(full_url)

        print(f"找到 {len(chapter_urls)} 个章节链接。")

        # Only the first two chapters, as a sample run.
        for url in chapter_urls[:2]:
            print(f"正在抓取章节: {url}")
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(3000)

            # Pull the text of every td.ctext cell, in document order.
            text_content = await page.evaluate("""() => {
                const tds = document.querySelectorAll('td.ctext');
                let text = '';
                tds.forEach(td => { text += td.innerText + '\\n'; });
                return text;
            }""")

            print("--- 抓取到的原始文本前100字 ---")
            print(text_content[:100])
            print("--------------------------------")

            # Parsing is factored into the pure helper above so it can be
            # exercised without a browser; merge its per-chapter output here.
            for word, entries in _parse_chapter_text(text_content).items():
                results.setdefault(word, []).extend(entries)

        await browser.close()

    # Persist the accumulated entries as human-readable UTF-8 JSON.
    with open("五车韵瑞_示例.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print("提取完成。由于源数据本身存在大量OCR乱码,JSON数据可能需要大量人工清洗。")
|
||||
|
||||
# Script entry point: drive the async scraper to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(scrape_ctext())
|
||||
Reference in New Issue
Block a user