Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
104
五车韵瑞/scrape_wuche.py
Normal file
104
五车韵瑞/scrape_wuche.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
CTEXT_INDEX_URL = "https://ctext.org/wiki.pl?if=gb&res=87723&remap=gb"
|
||||
|
||||
def _parse_chapter_text(text_content):
    """Parse one chapter's raw text into ``{word: [entry, ...]}``.

    Section state (volume / rhyme / tone) is tracked line by line and
    attached to every entry produced after the marker line that set it.

    NOTE(review): the CText wiki text is largely uncorrected OCR, so these
    heuristics (line starts with "卷"; short line containing "声") are
    best-effort and the output will need manual cleaning.
    """
    parsed = {}
    current_volume = "未知卷"
    # Never updated by the current heuristics — TODO: add a rhyme-header rule.
    current_rhyme = "未知韵"
    current_tone = "平声"

    for raw_line in text_content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Marker lines update parser state instead of producing entries.
        if line.startswith("卷"):
            current_volume = line
            continue
        if "声" in line and len(line) < 5:
            current_tone = line
            continue

        # Split "headword <sep> body"; fall back to a fixed 2-char headword.
        # The parts[0] guard prevents an empty-string key when the line
        # begins with a separator character such as ":".
        parts = re.split(r'[::\s]', line, maxsplit=1)
        if len(parts) == 2 and parts[0]:
            word, content = parts
        else:
            word = line[0:2] if len(line) > 2 else line
            content = line[2:]

        parsed.setdefault(word, []).append({
            "卷": current_volume,
            "大韵": current_rhyme,
            "声调": current_tone,
            "词条内容": content,
        })
    return parsed


async def scrape_ctext():
    """Scrape sample chapters of 五车韵瑞 from ctext.org and dump them to JSON.

    Side effects: launches a *visible* Chromium window (headless mode tends
    to trip the site's anti-bot protection) and writes ``五车韵瑞_示例.json``
    to the current directory.  Returns ``None``.
    """
    results = {}

    # Non-headless launch is deliberate: it helps get past the Cloudflare shield.
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800}
        )
        page = await context.new_page()

        print(f"正在访问目录页: {CTEXT_INDEX_URL}")
        await page.goto(CTEXT_INDEX_URL, wait_until="domcontentloaded")

        # Fixed pause so any Cloudflare challenge can resolve.
        await page.wait_for_timeout(5000)

        # Collect unique chapter links; the `seen` set makes de-duplication
        # O(1) per link instead of the original O(n) list-membership test.
        links = await page.locator("a[href*='wiki.pl?if=gb&chapter=']").all()
        chapter_urls = []
        seen = set()
        for link in links:
            href = await link.get_attribute("href")
            if not href:
                continue
            # lstrip("/") avoids "https://ctext.org//..." when the href is
            # an absolute path rather than a bare relative one.
            full_url = "https://ctext.org/" + href.lstrip("/")
            if full_url not in seen:
                seen.add(full_url)
                chapter_urls.append(full_url)

        print(f"找到 {len(chapter_urls)} 个章节链接。")

        # Only the first two chapters, as a sample run.
        for url in chapter_urls[:2]:
            print(f"正在抓取章节: {url}")
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(3000)

            # Pull the text of every td.ctext cell, in document order.
            text_content = await page.evaluate("""() => {
                const tds = document.querySelectorAll('td.ctext');
                let text = '';
                tds.forEach(td => { text += td.innerText + '\\n'; });
                return text;
            }""")

            print("--- 抓取到的原始文本前100字 ---")
            print(text_content[:100])
            print("--------------------------------")

            # Parsing is factored into the pure helper above so it can be
            # exercised without a browser; merge its per-chapter output here.
            for word, entries in _parse_chapter_text(text_content).items():
                results.setdefault(word, []).extend(entries)

        await browser.close()

    # Persist the accumulated entries as human-readable UTF-8 JSON.
    with open("五车韵瑞_示例.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print("提取完成。由于源数据本身存在大量OCR乱码,JSON数据可能需要大量人工清洗。")
|
||||
|
||||
# Script entry point: drive the async scraper to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(scrape_ctext())
|
||||
Reference in New Issue
Block a user