From 288116310671e04a4ec1daf1842a8e5f8088cea7 Mon Sep 17 00:00:00 2001 From: denglifan Date: Sun, 22 Mar 2026 16:43:10 +0800 Subject: [PATCH] =?UTF-8?q?Update:=20=E5=88=A0=E9=99=A4=E5=9E=83=E5=9C=BE?= =?UTF-8?q?=E7=A8=8B=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 佩文韵府/check_siai.py | 8 -- 佩文韵府/check_v2.py | 7 -- 佩文韵府/download_sequentially.py | 71 ------------- 佩文韵府/extract_links.py | 110 -------------------- 佩文韵府/fix_final.py | 14 --- 佩文韵府/fix_pipes.py | 69 ------------- 佩文韵府/fix_remaining.py | 24 ----- 佩文韵府/fix_structure.py | 57 ----------- 佩文韵府/fix_volume.py | 28 ------ 佩文韵府/parse_all.py | 83 --------------- 佩文韵府/parse_all_v2.py | 79 --------------- 佩文韵府/parse_html_logic.py | 9 -- 佩文韵府/parser.py | 124 ----------------------- 佩文韵府/robust_download.py | 65 ------------ 佩文韵府/robust_download_aio.py | 65 ------------ 佩文韵府/robust_download_fast.py | 65 ------------ 佩文韵府/robust_download_final.py | 79 --------------- 佩文韵府/still_missing.txt | 96 ------------------ 佩文韵府/test_clean.py | 53 ---------- 佩文韵府/test_dump.py | 10 -- 佩文韵府/test_dump2.py | 12 --- 佩文韵府/test_empty_word.py | 5 - 佩文韵府/test_heuristic.py | 54 ---------- 佩文韵府/test_heuristic2.py | 37 ------- 佩文韵府/test_hybrid.py | 56 ----------- 佩文韵府/test_parser.py | 31 ------ 佩文韵府/test_placeholder.py | 49 --------- 佩文韵府/test_playwright.py | 46 --------- 佩文韵府/test_prefixes.py | 21 ---- 佩文韵府/test_specific.py | 13 --- 佩文韵府/test_split.py | 18 ---- 佩文韵府/test_strip.py | 32 ------ 佩文韵府/test_tokenize.py | 21 ---- 分类.md | 93 +++++++++++++++++ 初学记/generate_json.py | 161 ------------------------------ 初学记/test_extract.py | 24 ----- 初学记/test_extract2.py | 54 ---------- 初学记/test_extract3.py | 127 ----------------------- 初学记/test_parse.py | 35 ------- 初学记/test_parse2.py | 93 ----------------- 初学记/test_split.py | 41 -------- 41 files changed, 93 insertions(+), 2046 deletions(-) delete mode 100644 佩文韵府/check_siai.py delete mode 100644 佩文韵府/check_v2.py delete mode 100644 佩文韵府/download_sequentially.py delete mode 100644 佩文韵府/extract_links.py delete mode 100644 佩文韵府/fix_final.py delete mode 100644 佩文韵府/fix_pipes.py delete mode 100644 佩文韵府/fix_remaining.py delete mode 100644 佩文韵府/fix_structure.py delete mode 100644 佩文韵府/fix_volume.py delete mode 100644 佩文韵府/parse_all.py delete mode 100644 佩文韵府/parse_all_v2.py delete mode 100644 佩文韵府/parse_html_logic.py delete mode 100644 佩文韵府/parser.py delete mode 100644 佩文韵府/robust_download.py delete mode 100644 佩文韵府/robust_download_aio.py delete mode 100644 佩文韵府/robust_download_fast.py delete mode 100644 佩文韵府/robust_download_final.py delete mode 100644 佩文韵府/still_missing.txt delete mode 100644 佩文韵府/test_clean.py delete mode 100644 佩文韵府/test_dump.py delete mode 100644 佩文韵府/test_dump2.py delete mode 100644 佩文韵府/test_empty_word.py delete mode 100644 佩文韵府/test_heuristic.py delete mode 100644 佩文韵府/test_heuristic2.py delete mode 100644 佩文韵府/test_hybrid.py delete mode 100644 佩文韵府/test_parser.py delete mode 100644 佩文韵府/test_placeholder.py delete mode 100644 佩文韵府/test_playwright.py delete mode 100644 佩文韵府/test_prefixes.py delete mode 100644 佩文韵府/test_specific.py delete mode 100644 佩文韵府/test_split.py delete mode 100644 佩文韵府/test_strip.py delete mode 100644 佩文韵府/test_tokenize.py delete mode 100644 初学记/generate_json.py delete mode 100644 初学记/test_extract.py delete mode 100644 初学记/test_extract2.py delete mode 100644 初学记/test_extract3.py delete mode 100644 初学记/test_parse.py delete mode 100644 初学记/test_parse2.py delete mode 100644 初学记/test_split.py diff --git a/佩文韵府/check_siai.py b/佩文韵府/check_siai.py deleted file mode 100644 index 77aa055..0000000 --- a/佩文韵府/check_siai.py +++ /dev/null @@ -1,8 +0,0 @@ -import json - -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -siai_citiao = data.get("𩅰", {}).get("词条", {}) -for k, v in list(siai_citiao.items())[:20]: - print(f"{k}: {v[:30]}...") diff --git a/佩文韵府/check_v2.py b/佩文韵府/check_v2.py deleted file mode 100644 index 7692359..0000000 --- a/佩文韵府/check_v2.py +++ /dev/null @@ -1,7 +0,0 @@ -import json -with open('peiwenyunfu_v2.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -for k in ['𩅰', '桵', '离', '梩']: - print(f"Rhyme: {k}") - print(json.dumps(data.get(k, {}), ensure_ascii=False, indent=2)) diff --git a/佩文韵府/download_sequentially.py b/佩文韵府/download_sequentially.py deleted file mode 100644 index 1b9c505..0000000 --- a/佩文韵府/download_sequentially.py +++ /dev/null @@ -1,71 +0,0 @@ -import os -import time -import urllib.request -import urllib.parse -from urllib.error import HTTPError - -def main(): - if not os.path.exists("html_files"): - os.makedirs("html_files") - - with open("still_missing.txt", "r", encoding="utf-8") as f: - urls = [line.strip() for line in f if line.strip()] - - proxy_support = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:10808', 'https': 'http://127.0.0.1:10808'}) - opener = urllib.request.build_opener(proxy_support) - urllib.request.install_opener(opener) - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', - 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', - 'Referer': 'https://zh.wikisource.org/' - } - - count = 0 - total = len(urls) - - for url in urls: - vol = urllib.parse.unquote(url).split('/')[-1] - filename = f'html_files/{vol}.html' - - if os.path.exists(filename): - print(f"[{count+1}/{total}] Skipping {vol} (exists)") - count += 1 - continue - - print(f"[{count+1}/{total}] Downloading {vol}...") - - success = False - for attempt in range(5): - try: - req = urllib.request.Request(url, headers=headers) - response = urllib.request.urlopen(req, timeout=15) - html = response.read().decode('utf-8') - - with open(filename, 'w', encoding='utf-8') as out_f: - out_f.write(html) - - print(f" -> Success!") - success = True - break - - except HTTPError as e: - if e.code == 403: - print(f" -> 403 Forbidden. Waiting 5 seconds...") - time.sleep(5) - else: - print(f" -> HTTP Error {e.code}") - except Exception as e: - print(f" -> Error: {e}") - - time.sleep(2) - - if not success: - print(f" -> Failed all attempts.") - - count += 1 - time.sleep(1) # Be nice to server - -if __name__ == "__main__": - main() diff --git a/佩文韵府/extract_links.py b/佩文韵府/extract_links.py deleted file mode 100644 index 4278fc8..0000000 --- a/佩文韵府/extract_links.py +++ /dev/null @@ -1,110 +0,0 @@ -import urllib.request -from bs4 import BeautifulSoup -import urllib.parse -import re -import os - -url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)" - -def chinese_to_arabic(cn_str): - cn_num = { - "零": 0, "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, - "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10, - "百": 100, "千": 1000, - } - result = 0 - temp = 0 - for char in cn_str: - if char in ["百", "千"]: - if temp == 0: - temp = 1 - result += temp * cn_num[char] - temp = 0 - elif char == "十": - if temp == 0: - temp = 1 - if len(cn_str) == 1: - return 10 - elif result == 0 and temp == 1 and cn_str[0] == "十": - result += 10 - temp = 0 - else: - result += temp * cn_num[char] - temp = 0 - else: - temp = cn_num.get(char, 0) - result += temp - return result - -def get_filename(vol_str): - m = re.match(r"卷(.+?)之(.+)", vol_str) - if m: - v1 = chinese_to_arabic(m.group(1)) - v2 = chinese_to_arabic(m.group(2)) - return f"卷{v1:03d}之{v2}.html" - m = re.match(r"卷(.+)", vol_str) - if m: - v1 = chinese_to_arabic(m.group(1)) - return f"卷{v1:03d}.html" - return vol_str + ".html" - -def main(): - try: - req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) - with urllib.request.urlopen(req, timeout=30) as response: - html = response.read().decode("utf-8") - except Exception as e: - print("Failed to fetch:", e) - # To avoid failing the test if the network is down but it mocks something else - # we can just pass here, but normally a mocked urllib would work. - pass - - try: - soup = BeautifulSoup(html, "html.parser") - except NameError: - return - - html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/" - existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set() - - missing_urls = [] - seen_urls = set() - base_url = "https://zh.wikisource.org" - - for a in soup.find_all("a"): - href = a.get("href") - if not href: - continue - unquoted_href = urllib.parse.unquote(href) - - # Only include `卷XXX之Y` links (ignore 全览 files) - if "御定佩文韻府" in unquoted_href and "四庫全書本" in unquoted_href and "/卷" in unquoted_href: - title_part = unquoted_href.split("/")[-1] - - if "全覽" in title_part or "全览" in title_part: - continue - - # Filter for `卷XXX之Y` pattern if strictly needed. - # But let's check regex pattern "卷.+?之.+" - if not re.match(r"卷.+?之.+", title_part): - continue - - full_url = urllib.parse.urljoin(base_url, href) - full_url = full_url.split('#')[0] - - if full_url in seen_urls: - continue - seen_urls.add(full_url) - - filename = get_filename(title_part) - if filename not in existing_files: - missing_urls.append(full_url) - - with open("missing_urls.txt", "w", encoding="utf-8") as f: - for u in missing_urls: - f.write(u + "\n") - - print(f"Found {len(missing_urls)} missing URLs.") - -if __name__ == "__main__": - main() diff --git a/佩文韵府/fix_final.py b/佩文韵府/fix_final.py deleted file mode 100644 index 5034682..0000000 --- a/佩文韵府/fix_final.py +++ /dev/null @@ -1,14 +0,0 @@ -import json - -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -for rhyme, r_data in data.items(): - if rhyme in ["metadata", "preface"]: continue - if "词条" in r_data: - for word, content in r_data["词条"].items(): - if "丨" in content: - r_data["词条"][word] = content.replace("丨", rhyme) - -with open('peiwenyunfu.json', 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) diff --git a/佩文韵府/fix_pipes.py b/佩文韵府/fix_pipes.py deleted file mode 100644 index 5cf5773..0000000 --- a/佩文韵府/fix_pipes.py +++ /dev/null @@ -1,69 +0,0 @@ -import json -import re - -print("Loading peiwenyunfu.json...") -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -prefixes = ["韻藻", "增", "増"] - -def clean_headword(word): - clean_word = word - # Try stripping prefixes - for _ in range(2): # In case there's "増韻藻" or something - for p in prefixes: - if clean_word.startswith(p) and len(clean_word) > len(p): - clean_word = clean_word[len(p):] - return clean_word - -def replace_pipes_in_content(content, word): - clean_word = clean_headword(word) - word_len = len(clean_word) - - if word_len == 0 or "丨" not in content: - return content - - def repl(match): - nonlocal pipe_idx - block = match.group(0) - block_len = len(block) - - if block_len % word_len == 0: - # Full word match! Reset alignment. - pipe_idx = 0 - return clean_word * (block_len // word_len) - else: - # Partial word match. Use current sequence. - res = "" - for _ in range(block_len): - res += clean_word[pipe_idx % word_len] - pipe_idx += 1 - return res - - pipe_idx = 0 - return re.sub(r'丨+', repl, content) - -print("Processing...") -for rhyme, r_data in data.items(): - if rhyme in ["metadata", "preface"]: - continue - - # 1. Fix 小韵描述 - if "小韵描述" in r_data and r_data["小韵描述"]: - # The placeholder should be replaced by the rhyme char - # BUT wait! The rhyme char might be simplified in our dictionary keys! - # The user's prompt used "东" for replacement in 小韵描述. - # So we just use the dictionary key `rhyme`. - r_data["小韵描述"] = r_data["小韵描述"].replace("丨", rhyme) - - # 2. Fix 词条 - if "词条" in r_data: - new_citiao = {} - for word, content in r_data["词条"].items(): - new_citiao[word] = replace_pipes_in_content(content, word) - r_data["词条"] = new_citiao - -print("Saving peiwenyunfu.json...") -with open('peiwenyunfu.json', 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) -print("Done!") diff --git a/佩文韵府/fix_remaining.py b/佩文韵府/fix_remaining.py deleted file mode 100644 index abbeb76..0000000 --- a/佩文韵府/fix_remaining.py +++ /dev/null @@ -1,24 +0,0 @@ -import json - -print("Loading...") -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -for rhyme, r_data in data.items(): - if rhyme in ["metadata", "preface"]: continue - - if "对语" in r_data and r_data["对语"]: - r_data["对语"] = r_data["对语"].replace("丨", rhyme) - if "摘句" in r_data and r_data["摘句"]: - r_data["摘句"] = r_data["摘句"].replace("丨", rhyme) - - if "词条" in r_data: - new_citiao = {} - for word, content in r_data["词条"].items(): - new_word = word.replace("丨", rhyme) - new_citiao[new_word] = content - r_data["词条"] = new_citiao - -with open('peiwenyunfu.json', 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) -print("Done!") diff --git a/佩文韵府/fix_structure.py b/佩文韵府/fix_structure.py deleted file mode 100644 index 1a45f29..0000000 --- a/佩文韵府/fix_structure.py +++ /dev/null @@ -1,57 +0,0 @@ -import json - -print("Loading...") -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -new_data = {} -new_data['metadata'] = data['metadata'] -new_data['preface'] = data['preface'] - -prefixes = ["韻藻", "韵藻", "増", "增"] - -def clean_key(k): - changed = True - while changed: - changed = False - for p in prefixes: - if k.startswith(p) and len(k) > len(p): - k = k[len(p):] - changed = True - return k - -for rhyme, r_data in data.items(): - if rhyme in ['metadata', 'preface']: continue - - # Store the original main rhyme - new_data[rhyme] = { - "卷": r_data["卷"], - "声": r_data["声"], - "韵": r_data["韵"], - "小韵描述": r_data["小韵描述"], - "韵藻": {}, - "对语": r_data.get("对语", ""), - "摘句": r_data.get("摘句", "") - } - - current_rhyme = rhyme - - for k, v in r_data.get("词条", {}).items(): - if len(k) == 1 and any(x in v[:15] for x in ['切', '説文', '廣韻', '玉篇', '集韻', '韻㑹', '音', '同', '釋名', '爾雅']): - current_rhyme = k - new_data[current_rhyme] = { - "卷": r_data["卷"], - "声": r_data["声"], - "韵": r_data["韵"], - "小韵描述": k + v, - "韵藻": {}, - "对语": "", - "摘句": "" - } - else: - cleaned = clean_key(k) - new_data[current_rhyme]["韵藻"][cleaned] = v - -with open('peiwenyunfu_v2.json', 'w', encoding='utf-8') as f: - json.dump(new_data, f, ensure_ascii=False, indent=4) -print(f"Old size: {len(data)}, New size: {len(new_data)}") diff --git a/佩文韵府/fix_volume.py b/佩文韵府/fix_volume.py deleted file mode 100644 index b8088d7..0000000 --- a/佩文韵府/fix_volume.py +++ /dev/null @@ -1,28 +0,0 @@ -import json - -print("Loading peiwenyunfu.json...") -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -current_vol = "" -fixed_count = 0 - -for rhyme, r_data in data.items(): - if rhyme in ['metadata', 'preface']: - continue - - vol = r_data.get("卷", "") - if vol.strip(): - # Update current valid volume - current_vol = vol.strip() - else: - # If volume is empty, use the current_vol - if current_vol: - r_data["卷"] = current_vol - fixed_count += 1 - -print(f"Fixed {fixed_count} missing volumes.") - -with open('peiwenyunfu.json', 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) -print("Saved to peiwenyunfu.json!") diff --git a/佩文韵府/parse_all.py b/佩文韵府/parse_all.py deleted file mode 100644 index 3b196af..0000000 --- a/佩文韵府/parse_all.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -import glob -import json -import re -from parser import parse_html - -def natural_sort_key(s): - basename = os.path.basename(s) - # Match "卷XXX之YY" where XXX and YY can be numbers or Chinese numerals - # Actually they are already digits in most cases, e.g., "卷001之1.html" or "卷106之4.html" - m = re.search(r"卷(\d+)(?:之(\d+))?", basename) - if m: - vol = int(m.group(1)) - sub = int(m.group(2)) if m.group(2) else 0 - return (1, vol, sub) - # fallback - return (2, 0, 0) - -def main(): - # Only pick files that start with "卷" to avoid "全覽" duplicate aggregations - files = glob.glob('html_files/卷*.html') - files.sort(key=natural_sort_key) - - print(f"Starting to parse {len(files)} files...") - - combined_result = {} - success_count = 0 - fail_count = 0 - - for idx, fpath in enumerate(files): - try: - res = parse_html(fpath) - for k, v in res.items(): - # Remove "(韵母)" prefix - clean_key = k.replace("(韵母)", "") - - if clean_key not in combined_result: - combined_result[clean_key] = v - else: - # Merge entries - combined_result[clean_key]["词条"].update(v.get("词条", {})) - if "对语" in v and v["对语"]: - combined_result[clean_key]["对语"] += v["对语"] - if "摘句" in v and v["摘句"]: - combined_result[clean_key]["摘句"] += v["摘句"] - - # Also, if the initial file didn't have "卷" properly parsed, update it - if not combined_result[clean_key]["卷"] and v["卷"]: - combined_result[clean_key]["卷"] = v["卷"] - if not combined_result[clean_key]["声"] and v["声"]: - combined_result[clean_key]["声"] = v["声"] - if not combined_result[clean_key]["韵"] and v["韵"]: - combined_result[clean_key]["韵"] = v["韵"] - - success_count += 1 - if idx % 50 == 0: - print(f"Parsed {idx}/{len(files)} files...") - except Exception as e: - print(f"Failed to parse {fpath}: {e}") - fail_count += 1 - - print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}") - print(f"Total unique rhyme characters extracted: {len(combined_result)}") - - # Construct final output with metadata - final_output = { - "metadata": { - "title": "御定佩文韵府", - "author": "张玉书等", - "dynasty": "清", - "total_volumes": 106, - "source": "2026年3月22日从维基文库导出" - }, - "preface": "", - "content": combined_result - } - - with open('peiwenyunfu.json', 'w', encoding='utf-8') as f: - json.dump(final_output, f, ensure_ascii=False, indent=4) - print("Saved output to peiwenyunfu.json") - -if __name__ == "__main__": - main() diff --git a/佩文韵府/parse_all_v2.py b/佩文韵府/parse_all_v2.py deleted file mode 100644 index 7c5bf75..0000000 --- a/佩文韵府/parse_all_v2.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -import glob -import json -import re -from parser import parse_html - -def natural_sort_key(s): - basename = os.path.basename(s) - # Match "卷XXX之YY" - m = re.search(r"卷(\d+)(?:之(\d+))?", basename) - if m: - vol = int(m.group(1)) - sub = int(m.group(2)) if m.group(2) else 0 - return (1, vol, sub) - return (2, 0, 0) - -def main(): - files = glob.glob('html_files/卷*.html') - files.sort(key=natural_sort_key) - - print(f"Starting to parse {len(files)} files...") - - combined_result = {} - success_count = 0 - fail_count = 0 - - for idx, fpath in enumerate(files): - try: - res = parse_html(fpath) - for k, v in res.items(): - # Remove "(韵母)" prefix - clean_key = k.replace("(韵母)", "") - - if clean_key not in combined_result: - combined_result[clean_key] = v - else: - # Merge entries - combined_result[clean_key]["词条"].update(v.get("词条", {})) - if "对语" in v and v["对语"]: - combined_result[clean_key]["对语"] += v["对语"] - if "摘句" in v and v["摘句"]: - combined_result[clean_key]["摘句"] += v["摘句"] - - if not combined_result[clean_key]["卷"] and v["卷"]: - combined_result[clean_key]["卷"] = v["卷"] - if not combined_result[clean_key]["声"] and v["声"]: - combined_result[clean_key]["声"] = v["声"] - if not combined_result[clean_key]["韵"] and v["韵"]: - combined_result[clean_key]["韵"] = v["韵"] - - success_count += 1 - if idx % 50 == 0: - print(f"Parsed {idx}/{len(files)} files...") - except Exception as e: - print(f"Failed to parse {fpath}: {e}") - fail_count += 1 - - print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}") - print(f"Total unique rhyme characters extracted: {len(combined_result)}") - - final_output = { - "metadata": { - "title": "御定佩文韵府", - "author": "张玉书等奉敕编", - "dynasty": "清", - "total_volumes": 106, - "source": "2026年3月22日从维基文库导出" - }, - "preface": "" - } - - final_output.update(combined_result) - - with open('peiwenyunfu.json', 'w', encoding='utf-8') as f: - json.dump(final_output, f, ensure_ascii=False, indent=4) - print("Saved output to peiwenyunfu.json") - -if __name__ == "__main__": - main() diff --git a/佩文韵府/parse_html_logic.py b/佩文韵府/parse_html_logic.py deleted file mode 100644 index c0e140f..0000000 --- a/佩文韵府/parse_html_logic.py +++ /dev/null @@ -1,9 +0,0 @@ -def simplify_char(c): - mapping = {'東': '东', '銅': '铜', '童': '童'} # add others if needed - return mapping.get(c, c) - -num_map = {'一':'1', '二':'2', '三':'3', '四':'4', '五':'5', '六':'6', '七':'7', '八':'8', '九':'9'} - -# "01之一" -# If vol_m = re.search(r"卷(.)之(.)", text) -# "0" + num_map[vol_m.group(1)] + "之" + vol_m.group(2) diff --git a/佩文韵府/parser.py b/佩文韵府/parser.py deleted file mode 100644 index 78d6c91..0000000 --- a/佩文韵府/parser.py +++ /dev/null @@ -1,124 +0,0 @@ -import re -from bs4 import BeautifulSoup - -def simplify(text): - mapping = { - '東': '东', - '聲': '声', - '韻': '韵', - '詞': '词', - '條': '条', - '對': '对', - '語': '语', - '摘': '摘', - '句': '句', - '卷': '卷', - '紅': '红', - '銅': '铜', - } - for k, v in mapping.items(): - text = text.replace(k, v) - return text - -def parse_html(file_path): - with open(file_path, "r", encoding="utf-8") as f: - soup = BeautifulSoup(f.read(), "html.parser") - - poem_div = soup.find("div", class_="poem") - if not poem_div: - return {} - - text = poem_div.get_text() - - # Extract Tone and Rhyme - tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text) - current_tone = simplify(tone_rhyme_m.group(1)) if tone_rhyme_m else "" - - if tone_rhyme_m: - raw_rhyme = tone_rhyme_m.group(2).strip() - m_rhyme = re.match(r"(.*?)[韻]", raw_rhyme) - current_rhyme = simplify(m_rhyme.group(1)) if m_rhyme else simplify(raw_rhyme.replace('韻', '')) - else: - current_rhyme = "" - - # Extract Volume - vol_m = re.search(r"卷([一二三四五六七八九十]+)之(.+)", text) - if vol_m: - num_map = {'一':'1', '二':'2', '三':'3', '四':'4', '五':'5', '六':'6', '七':'7', '八':'8', '九':'9', '十':'10'} - v1 = vol_m.group(1) - v1_digit = "".join(num_map.get(c, c) for c in v1) - if len(v1_digit) == 1: - v1_str = f"0{v1_digit}" - else: - v1_str = v1_digit - # Fix: the volume match might capture extra text after '之', e.g. '之一\n' - v2 = vol_m.group(2).split('\n')[0].strip() - current_vol = f"{v1_str}之{v2}" - else: - current_vol = "" - - # Extract chars - lines = text.split('\n') - rhyme_chars = [] - for i, line in enumerate(lines): - if tone_rhyme_m and tone_rhyme_m.group(2) in line: - chars_line = lines[i+1] - rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1] - break - - clean_lines = [] - for line in lines: - stripped = line.strip() - if not stripped: continue - if stripped == "欽定四庫全書": continue - if stripped.startswith("御定佩文韻府"): continue - if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped: - if "韻" in stripped: continue - if all(c in rhyme_chars or c in '  ' for c in stripped): - continue - clean_lines.append(stripped) - - clean_text = "".join(clean_lines) - - tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text) - - result = {} - current_char = None - - for word, desc_blocks in tokens: - word = word.strip() - desc_content = desc_blocks.replace('〈', '').replace('〉', '') - - # Is this a main character definition? - if word in rhyme_chars: - current_char = word - simplified_char = simplify(current_char) - # Create a new entry in result - key = f"(韵母){simplified_char}" - if key not in result: - result[key] = { - "卷": current_vol, - "声": current_tone, - "韵": current_rhyme, - "小韵描述": simplify(current_char + desc_content), - "词条": {}, - "对语": "", - "摘句": "" - } - elif word == "對語" or word == "对语": - if current_char: - key = f"(韵母){simplify(current_char)}" - # Could be multiple parts, though usually one per character block - result[key]["对语"] += desc_content - elif word == "摘句": - if current_char: - key = f"(韵母){simplify(current_char)}" - result[key]["摘句"] += desc_content - else: - # It's a 词条 - if current_char and word: - key = f"(韵母){simplify(current_char)}" - result[key]["词条"][word] = desc_content - - return result - diff --git a/佩文韵府/robust_download.py b/佩文韵府/robust_download.py deleted file mode 100644 index d6fa963..0000000 --- a/佩文韵府/robust_download.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import time -import urllib.parse -import requests -from requests.exceptions import RequestException - - -def main(): - if not os.path.exists("html_files"): - os.makedirs("html_files") - - with open("missing_urls.txt", "r", encoding="utf-8") as f: - urls = [line.strip() for line in f if line.strip()] - - proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"} - - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - } - - for url in urls: - # Decode the URL to get the original characters - decoded_url = urllib.parse.unquote(url) - # Extract the volume name, e.g., 卷001之1 - volume_name = decoded_url.split("/")[-1] - - filename = f"html_files/{volume_name}.html" - - if os.path.exists(filename): - print(f"Skipping {filename}, already exists.") - continue - - print(f"Downloading {volume_name} from {url}...") - - success = False - retries = 3 - while not success and retries > 0: - try: - response = requests.get( - url, headers=headers, proxies=proxies, timeout=15 - ) - if response.status_code == 200: - with open(filename, "w", encoding="utf-8") as out_f: - out_f.write(response.text) - success = True - print(f"Successfully downloaded {filename}") - else: - print( - f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}" - ) - except RequestException as e: - print(f"Error downloading {url}: {e}. Retries left: {retries - 1}") - - if not success: - retries -= 1 - time.sleep(2) # Delay before retry - - if not success: - print(f"Failed to download {url} after all retries.") - - time.sleep(0.5) # Small delay between requests - - -if __name__ == "__main__": - main() diff --git a/佩文韵府/robust_download_aio.py b/佩文韵府/robust_download_aio.py deleted file mode 100644 index a92ba57..0000000 --- a/佩文韵府/robust_download_aio.py +++ /dev/null @@ -1,65 +0,0 @@ -import asyncio -import aiohttp -from aiohttp_socks import ProxyConnector, ProxyType -import os -import urllib.parse -import time - - -async def download_file(session, url, filename, semaphore): - async with semaphore: - if os.path.exists(filename): - print(f"Skipping {filename}") - return True - - retries = 3 - while retries > 0: - try: - async with session.get(url, timeout=30) as response: - if response.status == 200: - content = await response.read() - with open(filename, "wb") as f: - f.write(content) - print(f"Successfully downloaded {filename}") - return True - else: - print(f"HTTP Error {response.status} for {url}") - except Exception as e: - print(f"Error for {url}: {e}") - - retries -= 1 - await asyncio.sleep(2) - - print(f"Failed all retries for {url}") - return False - - -async def main(): - if not os.path.exists("html_files"): - os.makedirs("html_files") - - with open("missing_urls.txt", "r", encoding="utf-8") as f: - urls = [line.strip() for line in f if line.strip()] - - # To satisfy the instruction: SOCKS5 proxy 127.0.0.1:10808 - # aiohttp_socks uses socks5:// instead of socks5h:// but with rdns=True it is equivalent. - proxy_url = "socks5://127.0.0.1:10808" - connector = ProxyConnector.from_url(proxy_url, rdns=True) - - headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"} - - semaphore = asyncio.Semaphore(10) # 10 concurrent downloads - - async with aiohttp.ClientSession(connector=connector, headers=headers) as session: - tasks = [] - for url in urls: - decoded_url = urllib.parse.unquote(url) - volume_name = decoded_url.split("/")[-1] - filename = f"html_files/{volume_name}.html" - tasks.append(download_file(session, url, filename, semaphore)) - - await asyncio.gather(*tasks) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/佩文韵府/robust_download_fast.py b/佩文韵府/robust_download_fast.py deleted file mode 100644 index ea51481..0000000 --- a/佩文韵府/robust_download_fast.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import time -import urllib.parse -import requests -from requests.exceptions import RequestException -import concurrent.futures - -def download_url(url): - proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"} - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - } - - decoded_url = urllib.parse.unquote(url) - volume_name = decoded_url.split("/")[-1] - filename = f"html_files/{volume_name}.html" - - if os.path.exists(filename): - # Already exists - return True, filename - - success = False - retries = 3 - while not success and retries > 0: - try: - response = requests.get(url, headers=headers, proxies=proxies, timeout=15) - if response.status_code == 200: - with open(filename, "w", encoding="utf-8") as out_f: - out_f.write(response.text) - success = True - print(f"Successfully downloaded {filename}") - return True, filename - else: - print(f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}") - except RequestException as e: - pass - - if not success: - retries -= 1 - time.sleep(2) - - if not success: - print(f"Failed to download {url} after all retries.") - return False, filename - - return False, filename - -def main(): - if not os.path.exists("html_files"): - os.makedirs("html_files") - - with open("missing_urls.txt", "r", encoding="utf-8") as f: - urls = [line.strip() for line in f if line.strip()] - - # Use ThreadPoolExecutor to download multiple files concurrently - with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: - futures = [executor.submit(download_url, url) for url in urls] - for future in concurrent.futures.as_completed(futures): - try: - future.result() - except Exception as e: - print(f"Exception during download: {e}") - -if __name__ == "__main__": - main() diff --git a/佩文韵府/robust_download_final.py b/佩文韵府/robust_download_final.py deleted file mode 100644 index 06e6f0e..0000000 --- a/佩文韵府/robust_download_final.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -import time -import urllib.parse -import requests -from requests.exceptions import RequestException -import concurrent.futures - - -def download_url(url): - proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"} - headers = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - } - - decoded_url = urllib.parse.unquote(url) - volume_name = decoded_url.split("/")[-1] - filename = f"html_files/{volume_name}.html" - - if os.path.exists(filename): - return True, filename - - success = False - retries = 5 - while not success and retries > 0: - try: - response = requests.get(url, headers=headers, proxies=proxies, timeout=15) - if response.status_code == 200: - with open(filename, "w", encoding="utf-8") as out_f: - out_f.write(response.text) - success = True - print(f"Successfully downloaded {filename}") - return True, filename - elif response.status_code == 403: - print( - f"HTTP Error 403 for {url}. Waiting longer to avoid rate limit..." - ) - time.sleep(5) - else: - print( - f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}" - ) - except RequestException as e: - pass - - if not success: - retries -= 1 - time.sleep(3) - - if not success: - print(f"Failed to download {url} after all retries.") - return False, filename - - return False, filename - - -def main(): - if not os.path.exists("html_files"): - os.makedirs("html_files") - - with open("still_missing.txt", "r", encoding="utf-8") as f: - urls = [line.strip() for line in f if line.strip()] - - print(f"Starting download for {len(urls)} remaining files...") - - # Use ThreadPoolExecutor to download multiple files concurrently - # Reduced max_workers to 5 to avoid triggering 403 Forbidden rate limits - with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: - futures = [executor.submit(download_url, url) for url in urls] - for future in concurrent.futures.as_completed(futures): - try: - future.result() - except Exception as e: - print(f"Exception during download: {e}") - - -if __name__ == "__main__": - main() diff --git a/佩文韵府/still_missing.txt b/佩文韵府/still_missing.txt deleted file mode 100644 index e02ed7d..0000000 --- a/佩文韵府/still_missing.txt +++ /dev/null @@ -1,96 +0,0 @@ -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7070%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B5 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7077%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B5 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B02 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B03 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B04 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B05 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B06 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B07 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B08 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B09 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B10 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B6 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B7 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B8 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B5 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B6 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B5 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B5 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B7 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B8 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B9 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B010 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B6 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B7 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B8 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B9 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B010 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B011 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B012 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B4 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B5 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B6 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B7 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B8 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B1 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B2 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B3 -https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7106%E4%B9%8B2 diff --git a/佩文韵府/test_clean.py b/佩文韵府/test_clean.py deleted file mode 100644 index dbb3e8f..0000000 --- a/佩文韵府/test_clean.py +++ /dev/null @@ -1,53 +0,0 @@ -import re -from bs4 import BeautifulSoup - -with open("html_files/卷001之1.html", "r", encoding="utf-8") as f: - soup = BeautifulSoup(f.read(), "html.parser") - -text = soup.find("div", class_="poem").get_text() - -# Extract Tone and Rhyme -tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text) -if tone_rhyme_m: - print("Tone:", tone_rhyme_m.group(1)) - print("Rhyme:", tone_rhyme_m.group(2)) - -# Extract Volume -vol_m = re.search(r"卷(.)之(.)", text) -if vol_m: - print("Volume:", f"0{vol_m.group(1)}之{vol_m.group(2)}" if vol_m.group(1) in '一二三四五六七八九' else f"{vol_m.group(1)}之{vol_m.group(2)}") - -# Extract chars -lines = text.split('\n') -rhyme_chars = [] -for i, line in enumerate(lines): - if tone_rhyme_m.group(2) in line: - # the next line usually has the characters - chars_line = lines[i+1] - rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1] - break - -print("Chars:", rhyme_chars) - -# Now, we want to strip all lines that are headers. -# Actually, headers repeat: "欽定四庫全書", "御定佩文韻府...", "上平聲..." -# We can just filter out these known header lines! -clean_lines = [] -for line in lines: - stripped = line.strip() - if not stripped: continue - if stripped == "欽定四庫全書": continue - if stripped.startswith("御定佩文韻府"): continue - if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped: - if "韻" in stripped: continue - # Is it the chars line? - if all(c in rhyme_chars or c in '  ' for c in stripped): - continue - clean_lines.append(stripped) - -clean_text = "".join(clean_lines) -print("Start of clean text:", clean_text[:100]) - -tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text) -print("First token:", tokens[0]) - diff --git a/佩文韵府/test_dump.py b/佩文韵府/test_dump.py deleted file mode 100644 index d637757..0000000 --- a/佩文韵府/test_dump.py +++ /dev/null @@ -1,10 +0,0 @@ -from bs4 import BeautifulSoup - -with open("html_files/卷001之1.html", "r", encoding="utf-8") as f: - soup = BeautifulSoup(f.read(), "html.parser") - -poem_div = soup.find("div", class_="poem") -if poem_div: - for i, p in enumerate(poem_div.find_all("p")[:20]): - print(f"--- P {i} ---") - print(p.text[:100].replace('\n', ' ')) diff --git a/佩文韵府/test_dump2.py b/佩文韵府/test_dump2.py deleted file mode 100644 index 92c5435..0000000 --- a/佩文韵府/test_dump2.py +++ /dev/null @@ -1,12 +0,0 @@ -import re -from bs4 import BeautifulSoup - -with open("html_files/卷001之1.html", "r", encoding="utf-8") as f: - soup = BeautifulSoup(f.read(), "html.parser") - -poem_div = soup.find("div", class_="poem") -if poem_div: - lines = poem_div.get_text().split("\n") - lines = [line.strip() for line in lines if line.strip()] - for i, line in enumerate(lines[:50]): - print(f"{i}: {line}") diff --git a/佩文韵府/test_empty_word.py b/佩文韵府/test_empty_word.py deleted file mode 100644 index 840b28c..0000000 --- a/佩文韵府/test_empty_word.py +++ /dev/null @@ -1,5 +0,0 @@ -import re -text = "對語〈渭北 江東〉〈 平北 安東〉摘句〈力障百川東〉" -tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text) -for i, (word, desc_blocks) in enumerate(tokens): - print(f"Token {i}: WORD='{word}' DESCS={desc_blocks}") diff --git a/佩文韵府/test_heuristic.py b/佩文韵府/test_heuristic.py deleted file mode 100644 index d28c497..0000000 --- a/佩文韵府/test_heuristic.py +++ /dev/null @@ -1,54 +0,0 @@ -import json -import re - -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -prefixes = ["韻藻", "增"] - -def replace_pipes(content, word): - clean_word = word - for p in prefixes: - if clean_word.startswith(p) and len(clean_word) > len(p): - clean_word = clean_word[len(p):] - break # only strip one prefix - - word_len = len(clean_word) - if word_len == 0: - return content.replace("丨", "") - - result = [] - pipe_idx = 0 - chars_since_last_pipe = 0 - - for char in content: - if char == "丨": - if chars_since_last_pipe >= 5: - # Long gap -> reset pipe_idx! - # Wait, only reset if we aren't in the middle of a perfect mapping? - # Actually, if the gap is >=5, it's definitely a new occurrence. - pipe_idx = 0 - - result.append(clean_word[pipe_idx % word_len]) - pipe_idx += 1 - chars_since_last_pipe = 0 - else: - result.append(char) - chars_since_last_pipe += 1 - - return "".join(result) - -# Test specific words -test_cases = [ - ("首陽東", "詩采葑采葑丨丨之丨"), - ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"), - ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"), - ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"), - ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"), -] - -for w, c in test_cases: - print(f"Word: {w}") - print(f"Orig: {c}") - print(f"Fix : {replace_pipes(c, w)}") - print("-" * 40) diff --git a/佩文韵府/test_heuristic2.py b/佩文韵府/test_heuristic2.py deleted file mode 100644 index 6216b65..0000000 --- a/佩文韵府/test_heuristic2.py +++ /dev/null @@ -1,37 +0,0 @@ -import json - -prefixes = ["韻藻", "增"] - -def replace_pipes_no_reset(content, word): - clean_word = word - for p in prefixes: - if clean_word.startswith(p) and len(clean_word) > len(p): - clean_word = clean_word[len(p):] - break - - word_len = len(clean_word) - if word_len == 0: - return content.replace("丨", "") - - result = [] - pipe_idx = 0 - for char in content: - if char == "丨": - result.append(clean_word[pipe_idx % word_len]) - pipe_idx += 1 - else: - result.append(char) - return "".join(result) - -test_cases = [ - ("首陽東", "詩采葑采葑丨丨之丨"), - ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"), - ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"), - ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"), - ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"), -] - -for w, c in test_cases: - print(f"Word: {w}") - print(f"NoRst: {replace_pipes_no_reset(c, w)}") - print("-" * 40) diff --git a/佩文韵府/test_hybrid.py b/佩文韵府/test_hybrid.py deleted file mode 100644 index cf57388..0000000 --- a/佩文韵府/test_hybrid.py +++ /dev/null @@ -1,56 +0,0 @@ -import json -import re - -prefixes = ["韻藻", "增"] - -def replace_pipes_hybrid(content, word): - clean_word = word - for p in prefixes: - if clean_word.startswith(p) and len(clean_word) > len(p): - clean_word = clean_word[len(p):] - break - - word_len = len(clean_word) - if word_len == 0: - return content.replace("丨", "") - - def repl(match): - nonlocal pipe_idx - block = match.group(0) - block_len = len(block) - - if block_len % word_len == 0: - # Full word match! Reset alignment. - pipe_idx = 0 - return clean_word * (block_len // word_len) - else: - # Partial word match. Use current sequence. - res = "" - for _ in range(block_len): - res += clean_word[pipe_idx % word_len] - pipe_idx += 1 - return res - - pipe_idx = 0 - return re.sub(r'丨+', repl, content) - -test_cases = [ - ("首陽東", "詩采葑采葑丨丨之丨"), - ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"), - ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"), - ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"), - ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"), -] - -for w, c in test_cases: - print(f"Word: {w}") - print(f"Orig: {c}") - print(f"Hybr: {replace_pipes_hybrid(c, w)}") - print("-" * 40) -test_cases.append(("紫殿東", "時魚躍丨丨丨温庭筠詩一夕丨丨")) - -for w, c in test_cases[-1:]: - print(f"Word: {w}") - print(f"Orig: {c}") - print(f"Hybr: {replace_pipes_hybrid(c, w)}") - print("-" * 40) diff --git a/佩文韵府/test_parser.py b/佩文韵府/test_parser.py deleted file mode 100644 index 73acff7..0000000 --- a/佩文韵府/test_parser.py +++ /dev/null @@ -1,31 +0,0 @@ -import json -from parser import parse_html - -def test_parse_html(): - file_path = "html_files/卷001之1.html" - result = parse_html(file_path) - - # Save for manual inspection - with open("output.json", "w", encoding="utf-8") as f: - json.dump(result, f, ensure_ascii=False, indent=2) - - # Check that it returns a dictionary - assert isinstance(result, dict) - - # Let's see what keys are in it - keys = list(result.keys()) - print("Keys found:", keys) - - if len(keys) > 0: - first_key = keys[0] - assert "卷" in result[first_key] - assert "声" in result[first_key] - assert "韵" in result[first_key] - assert "小韵描述" in result[first_key] - assert "词条" in result[first_key] - assert "对语" in result[first_key] - assert "摘句" in result[first_key] - -if __name__ == "__main__": - test_parse_html() - print("Tests passed!") diff --git a/佩文韵府/test_placeholder.py b/佩文韵府/test_placeholder.py deleted file mode 100644 index 7bf6f7a..0000000 --- a/佩文韵府/test_placeholder.py +++ /dev/null @@ -1,49 +0,0 @@ -import json -import re - -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -# Analyze a few -count_match = 0 -count_mismatch = 0 - -for rhyme, r_data in list(data.items())[:5]: # Skip metadata, preface - if rhyme in ["metadata", "preface"]: - continue - print(f"\nRhyme: {rhyme}") - - # 1. 小韵描述 - desc = r_data.get("小韵描述", "") - desc_fixed = desc.replace("丨", rhyme) - print(f"Desc original: {desc[:30]}...") - print(f"Desc fixed: {desc_fixed[:30]}...") - - # 2. 词条 - for word, content in list(r_data.get("词条", {}).items())[:5]: - pipe_count = content.count("丨") - word_len = len(word) - if pipe_count == 0: - continue - - print(f"Word: {word} (len {word_len}), pipes: {pipe_count}") - print(f"Original: {content}") - - # Test replacing - if pipe_count % word_len == 0: - # We can replace them in groups of word_len - fixed_content = "" - pipe_idx = 0 - for char in content: - if char == "丨": - fixed_content += word[pipe_idx % word_len] - pipe_idx += 1 - else: - fixed_content += char - print(f"Fixed: {fixed_content}") - count_match += 1 - else: - print("MISMATCH length!") - count_mismatch += 1 - -print(f"\nMatches: {count_match}, Mismatches: {count_mismatch}") diff --git a/佩文韵府/test_playwright.py b/佩文韵府/test_playwright.py deleted file mode 100644 index a51fb7f..0000000 --- a/佩文韵府/test_playwright.py +++ /dev/null @@ -1,46 +0,0 @@ -import asyncio -from playwright.async_api import async_playwright -import urllib.parse -import sys - - -async def main(): - url = sys.argv[1] - proxy_server = "socks5://127.0.0.1:10808" - - async with async_playwright() as p: - # Try with proxy - browser = await p.chromium.launch(headless=True, proxy={"server": proxy_server}) - context = await browser.new_context( - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - ) - page = await context.new_page() - - try: - print(f"Loading {url} via proxy...") - await page.goto(url, timeout=30000, wait_until="domcontentloaded") - content = await page.content() - print(f"Success! Content length: {len(content)}") - except Exception as e: - print(f"Error via proxy: {e}") - await browser.close() - - # Try without proxy - print("Retrying without proxy...") - browser = await p.chromium.launch(headless=True) - context = await browser.new_context( - user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - ) - page = await context.new_page() - try: - await page.goto(url, timeout=30000, wait_until="domcontentloaded") - content = await page.content() - print(f"Success! Content length: {len(content)}") - except Exception as e2: - print(f"Error without proxy: {e2}") - - await browser.close() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/佩文韵府/test_prefixes.py b/佩文韵府/test_prefixes.py deleted file mode 100644 index 270ab0e..0000000 --- a/佩文韵府/test_prefixes.py +++ /dev/null @@ -1,21 +0,0 @@ -import json - -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -mismatches = [] -prefixes_found = set() - -for rhyme, r_data in data.items(): - if rhyme in ["metadata", "preface"]: - continue - for word, content in r_data.get("词条", {}).items(): - pipe_count = content.count("丨") - if pipe_count == 0: - continue - if pipe_count % len(word) != 0: - mismatches.append((word, pipe_count)) - -print(f"Total mismatches: {len(mismatches)}") -for w, p in mismatches[:20]: - print(f"{w} (len {len(w)}), pipes: {p}") diff --git a/佩文韵府/test_specific.py b/佩文韵府/test_specific.py deleted file mode 100644 index 3033327..0000000 --- a/佩文韵府/test_specific.py +++ /dev/null @@ -1,13 +0,0 @@ -import json - -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -for rhyme, r_data in data.items(): - if rhyme in ["metadata", "preface"]: - continue - for word, content in r_data.get("词条", {}).items(): - if word in ["紫殿東", "少微東", "東海東", "日夜東", "隔西東"]: - print(f"Word: {word}") - print(f"Content: {content}") - print("-" * 40) diff --git a/佩文韵府/test_split.py b/佩文韵府/test_split.py deleted file mode 100644 index 13ceec0..0000000 --- a/佩文韵府/test_split.py +++ /dev/null @@ -1,18 +0,0 @@ -import re -from bs4 import BeautifulSoup - -with open("html_files/卷001之1.html", "r", encoding="utf-8") as f: - soup = BeautifulSoup(f.read(), "html.parser") - -poem_div = soup.find("div", class_="poem") -text = poem_div.get_text() - -# Extract the list of characters. -# It appears after "一東韻一" or similar. -m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*?)\n(.*?)\n", text) -if m: - print("Tone:", m.group(1)) - print("Rhyme:", m.group(2)) - print("Chars line:", m.group(3)) - rhyme_chars = [c for c in m.group(3).replace(' ', ' ').split() if len(c) == 1] - print("Chars:", rhyme_chars) diff --git a/佩文韵府/test_strip.py b/佩文韵府/test_strip.py deleted file mode 100644 index b14a491..0000000 --- a/佩文韵府/test_strip.py +++ /dev/null @@ -1,32 +0,0 @@ -import json -import re - -with open('peiwenyunfu.json', 'r', encoding='utf-8') as f: - data = json.load(f) - -prefixes = ["韻藻", "增"] - -mismatches = [] -total_pipes = 0 - -for rhyme, r_data in data.items(): - if rhyme in ["metadata", "preface"]: - continue - for word, content in r_data.get("词条", {}).items(): - clean_word = word - for p in prefixes: - if clean_word.startswith(p) and len(clean_word) > len(p): - clean_word = clean_word[len(p):] - - pipe_count = content.count("丨") - if pipe_count == 0: - continue - total_pipes += 1 - if pipe_count % len(clean_word) != 0: - mismatches.append((word, clean_word, pipe_count, content)) - -print(f"Total entries with pipes: {total_pipes}") -print(f"Total mismatches after stripping: {len(mismatches)}") -for w, cw, p, c in mismatches[:10]: - print(f"{w} -> {cw} (len {len(cw)}), pipes: {p}") - print(f" {c}") diff --git a/佩文韵府/test_tokenize.py b/佩文韵府/test_tokenize.py deleted file mode 100644 index 3b86475..0000000 --- a/佩文韵府/test_tokenize.py +++ /dev/null @@ -1,21 +0,0 @@ -import re -from bs4 import BeautifulSoup - -with open("html_files/卷001之1.html", "r", encoding="utf-8") as f: - soup = BeautifulSoup(f.read(), "html.parser") - -text = soup.find("div", class_="poem").get_text().replace('\n', '') - -# Remove header junk for this test (just find the first '東〈') -start_idx = text.find('東〈') -text = text[start_idx:] - -# Tokenize into pairs of (Word, Description) -# Using regex to find all Word〈Description〉 -# Wait, multiple 〈 〉 can follow a word like 對語〈...〉〈...〉 -# We can find all chunks of non-〈 characters, followed by one or more 〈...〉 -tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text) - -for i, (word, desc_blocks) in enumerate(tokens[:20]): - print(f"Token {i}: WORD='{word}' DESCS={desc_blocks[:30]}...") - diff --git a/分类.md b/分类.md index e69de29..18aa389 100644 --- a/分类.md +++ b/分类.md @@ -0,0 +1,93 @@ +# 类书分类与特点梳理 + +为辅助专业诗歌创作小程序提供“知识库”,整理现有类书数据的优缺点、特点及在辅助创作中的核心价值如下: + +## 一、词条式 +这类类书通常以主题(如天文地理、自然物候等)分类,适合根据意象(如“雪”、“月”)进行正向检索找语料。 + +- **《白孔六帖》** + - **内容与编排顺序**:按天文地理、历法礼仪、生活物件等类目编排。 + - **特点**:采辑各种典籍中的成语、典故、短语,多为四字短句,并附带简短的释义。 + - **整理问题**:提取很不干净,正文与释义括号混合在一起且缺乏断句。 + - **辅助创作价值**:可作为提炼“四字骈语”或冷僻典故的原始素材库,但需大量NLP清洗。 + - **典型例子**:`"content": "白髙眀柔克(髙明天也柔克寒暑不干)隂隲下民(言天黙定下民之命)天尊(地卑)..."` + +- **《北堂书钞》** + - **内容与编排顺序**:按帝王、后妃、政术、刑法、官职、礼仪等社会制度与名物编排。 + - **特点**:成书于隋,引用起自三代、汉、魏,迄于宋、齐。侧重于对概念的追本溯源,对词条解释较为详细。 + - **整理问题**:JSON提取的正文部分缺乏标点,长句粘连。 + - **辅助创作价值**:古朴凝练。如创作者需要引用较为正统的经史概念(如写咏史诗),该书能提供最原汁原味的早期语料。 + - **典型例子**:`"content": "皇者天人之總稱 帝者天號 正氣爲帝 帝者天下之所適王者天下之所徃也..."` + +- **《初学记》** + - **内容与编排顺序**:按天、岁时、官职、地理等编排。每个词条下细分为“叙事”、“事对”、“诗文”。 + - **特点**:词条内容非常详细、层次分明,具有极强的结构化特征。 + - **整理问题**:整理得非常干净,JSON层级保留了原始分类结构。 + - **辅助创作价值**:**价值极高**。“事对”直接提供了现成的对仗词汇(写诗利器);“诗文”栏目则方便查阅前人咏此物的范本。 + - **典型例子**:`"事对": "轉葢 倚杵(桓譚新論天如葢轉左旋...) 覆盆 轉轂(王充論衡曰...)"` + +- **《海录碎事》** + - **内容与编排顺序**:按非常细碎的关键词(天、地、衣冠等)分类。 + - **特点**:每一类下词条过细(常为生僻两字词),每一词条下内容极少,通常只有一两句包含该词的引文。 + - **整理问题**:词条过度碎片化,键名就是细碎词汇。 + - **辅助创作价值**:相当于一个“逆向用词示例库”。诗人想用某个生僻意象时,用它查看古人如何将其嵌入诗句中。 + - **典型例子**:`"曽穹": [{"content": "蹀足循廣除瞬目矖曽穹(文選謝惠連詩)"}]` + +- **《骈字类编》** + - **内容与编排顺序**:按天地、时令、山水、珍宝、器物等词汇大类编排。 + - **特点**:专收“骈语”(双音节词),词条极多,详细列出了该词在各路经史子集中出现的位置。 + - **整理问题**:长段引文粘连,缺少现代标点。 + - **辅助创作价值**:**价值极高**。古诗词创作最核心的就是对“双字词汇”的拿捏,此书就是一个庞大且天然的古典双字词语境库。 + - **典型例子**:`"天地": "易干夫大人者与天地合其德 又坤天地变化草木蕃天地闭贤人隐..."` + +- **《太平御览》** + - **内容与编排顺序**:以天、地、人、事、物为大类顺序。 + - **特点**:在前代《修文殿御览》《艺文类聚》等书基础上编纂而成,包罗万象,词条内容全部为原文引文。 + - **整理问题**:带有原书排版格式(换行、`《书名》曰`),阅读体验较佳。 + - **辅助创作价值**:提供最详实的事物背景知识,适合在需要了解某个意象(如“雪”)的全面历史文化背景时使用。 + - **典型例子**:`"《三五曆記》曰:未有天地之時,混沌狀如雞子,溟涬始牙..."` + +- **《艺文类聚》** + - **内容与编排顺序**:按天、岁时、地理、帝王、人、乐、职官等编排。 + - **特点**:事文交织。词条收录较广,既有经史中的“叙事”,也有大量的历代【诗】、【赋】、【赞】。 + - **整理问题**:带有原书的标点和分段标记,格式清晰。 + - **辅助创作价值**:极好的文学创作资料库,帮助创作者一站式看到某个主题在古代诗文中的各种形态。 + - **典型例子**:`"【詩】晉傅玄《兩儀詩》曰:兩儀始分.元氣上清.列宿垂象.六位時成..."` + +- **《玉海》** + - **内容与编排顺序**:详分天文、地理、典章制度等。 + - **特点**:词条常为长篇大论,注重典章制度、天文地理的详细考证。 + - **整理问题**:存在词条名拆分或提取不精准的问题(如把“中宫二十八舍”拆断处理)。正文缺乏标点。 + - **辅助创作价值**:提供精准宏大的制度与天文星象知识,适合创作偏严肃或庙堂题材的诗歌。 + - **典型例子**:`"中宫": "漢天文志(史天官書同)中宫天極星其一明者泰一之常居也旁三星三公..."` + +- **《渊鉴类函》** + - **内容与编排顺序**:按大部类排布。 + - **特点**:清代集大成之作,将引文明确区分为“原”(原类书已有)与“增”(清代新增)。 + - **整理问题**:使用空格作为句读分隔,未见全角标点。 + - **辅助创作价值**:覆盖面最广的兜底宝库,适合查阅各种意象的演变和最全面的引文集合。 + - **典型例子**:`"原釋名曰天坦也坦然髙而逺也 增又曰天顯也在上髙顯也..."` + +## 二、韵式 +这类类书专为押韵而生,以“韵母”或“韵字”为一级分类,适合在写格律诗卡壳、需要找特定韵脚词汇时使用。 + +- **《佩文韵府》** + - **内容与编排顺序**:按平水韵分类(如“一东”),下系以该字为尾的各种词条及摘句。 + - **特点**:非常详细,包含声调、韵部说明,以及海量的带出处短句。以元代《韵府群玉》和明代《五车韵瑞》为基础增补。 + - **整理问题**:JSON结构层次非常清晰。使用“丨”符号代替原韵字(如“東”被替换为“丨”)。 + - **辅助创作价值**:**写诗必备神器**。想用“东”韵时,能瞬间获得大量以东结尾的词汇(如“南东”、“活东”)及例句,极大辅助押韵。 + - **典型例子**:`"(韵母)东": { "小韵描述": "东德红切眷方也...", "词条": { "活東": "爾雅科斗丨丨蝦蟇也...", "牆東": "後漢書避世丨丨王君公..." } }` + +- **《韵府群玉》** + - **内容与编排顺序**:按大韵分类,列出小韵和具体词条。 + - **特点**:早期的韵书,条目较为简练紧凑。 + - **整理问题**:条目内容被尖括号`〈〉`包裹,夹杂部分注音(如“徳紅切”)。 + - **辅助创作价值**:与佩文韵府同理,但体量更小,适合快速查阅核心的传统押韵典故。 + - **典型例子**:`"東": { "道東": "〈漢鄭𤣥事馬融辭歸融曰吾道東矣本〉" }` + +- **《五车韵瑞》** + - **内容与编排顺序**:/ + - **特点**:/ + - **整理问题**:**严重问题**,当前文件夹内的 `allorigins.json` 数据获取失败,内容实际上是 Nginx 的 `500 Internal Server Error` 报错网页代码,并非JSON数据。 + - **辅助创作价值**:暂时无价值。需要修复爬虫和数据源。 + - **典型例子**:`500 Internal Server Error...` \ No newline at end of file diff --git a/初学记/generate_json.py b/初学记/generate_json.py deleted file mode 100644 index f010f29..0000000 --- a/初学记/generate_json.py +++ /dev/null @@ -1,161 +0,0 @@ -import bs4 -import os -import re -import json - -html_dir = "epub_extracted/OPS" - -def parse_html(filepath): - with open(filepath, "r", encoding="utf-8") as f: - soup = bs4.BeautifulSoup(f.read(), "html.parser") - - poem_divs = soup.find_all("div", class_="poem") - events = [] - - for div in poem_divs: - for p in div.find_all("p"): - for child in p.children: - if child.name == "br": - pass # ignore br, we want continuous text - elif child.name == "span" and child.get("id") and child.text.strip().endswith("部"): - events.append(("section", child.text.strip())) - elif child.name == "small": - small_text = child.get_text() - if not small_text.startswith("〈") and not small_text.startswith("("): - events.append(("text", f"({small_text})")) - else: - events.append(("text", small_text)) - elif isinstance(child, str): - events.append(("text", child)) - else: - events.append(("text", child.get_text())) - - return events - - -def extract_sections(content_text): - result = {"叙事": "", "事对": "", "诗文": ""} - - rest_after_narrative = content_text - - shìduì_start = rest_after_narrative.find("事對") - if shìduì_start == -1: - shìduì_start = rest_after_narrative.find("事对") - - genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)" - - if shìduì_start != -1: - result["叙事"] = rest_after_narrative[:shìduì_start] - rest_after_shiduì = rest_after_narrative[shìduì_start + 2 :] - - shiwen_match = re.search(genre_pattern, rest_after_shiduì) - - if shiwen_match: - split_idx = shiwen_match.start() + 1 - result["事对"] = rest_after_shiduì[:split_idx] - result["诗文"] = rest_after_shiduì[split_idx:] - else: - result["事对"] = rest_after_shiduì - else: - shiwen_match = re.search(genre_pattern, rest_after_narrative) - if shiwen_match: - split_idx = shiwen_match.start() + 1 - result["叙事"] = rest_after_narrative[:split_idx] - result["诗文"] = rest_after_narrative[split_idx:] - else: - result["叙事"] = rest_after_narrative - - for k in result: - result[k] = result[k].replace("〈", "(").replace("〉", ")").strip() - - return result - -def main(): - categories = {} - total_entries_found = 0 - - for vol in range(1, 31): - filename = None - for fn in os.listdir(html_dir): - if fn.endswith(f"juan{vol:02d}.xhtml"): - filename = fn - break - - if not filename: - print(f"Volume {vol} not found") - continue - - events = parse_html(os.path.join(html_dir, filename)) - - merged = [] - current_section = "" - current_text = [] - - for ev_type, val in events: - if ev_type == "section": - if current_text: - merged.append((current_section, "".join(current_text))) - current_text = [] - current_section = val - else: - current_text.append(val) - - if current_text: - merged.append((current_section, "".join(current_text))) - - for sec, txt in merged: - matches = list(re.finditer(r"(?:^|[〉)\u3000\s])([^〈〉<>()\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", txt)) - - if not matches: - continue - - for i, m in enumerate(matches): - entry_name = m.group(1).strip() - start_idx = m.end() # Start of the content AFTER 〈叙事〉 - # Find the actual start index of the next match - # m.start(1) skips the optional prefix character! - # Wait! matches[i+1].start() gives the start of the whole match, including the optional prefix. - # If we use matches[i+1].start(), we might include the `〉` from the END of this entry inside the next entry's ignored space. - # Actually, `matches[i+1].start()` includes the boundary char `〉`. - # We should stop at `matches[i+1].start(1) - len(prefix)`? - # Actually, if the next entry starts with `〉`, that `〉` BELONGS to the CURRENT entry's poetry! - # E.g. `...終負昔賢心〉虹蜺第七〈敘事〉` - # The `〉` is the closing bracket for `蘇味道詠虹詩〈...終負昔賢心`. - # So we MUST include it! - # `matches[i+1].start(1)` points EXACTLY to `虹`, so `txt[start_idx:matches[i+1].start(1)]` WILL INCLUDE `〉` at the end! - # This is EXACTLY what we want! - end_idx = matches[i+1].start(1) if i + 1 < len(matches) else len(txt) - - content_text = txt[start_idx:end_idx] - - sections = extract_sections(content_text) - - if entry_name not in categories: - categories[entry_name] = [] - - categories[entry_name].append({ - "volume": vol, - "section": sec, - "content": sections - }) - total_entries_found += 1 - - final_json = { - "metadata": { - "title": "初学记", - "author": "徐坚", - "dynasty": "唐", - "total_volumes": 30, - "source": "2026年1月28日从维基文库导出" - }, - "preface": "", - "categories": categories - } - - with open("初学记.json", "w", encoding="utf-8") as f: - json.dump(final_json, f, ensure_ascii=False, indent=2) - - print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.") - -if __name__ == "__main__": - main() diff --git a/初学记/test_extract.py b/初学记/test_extract.py deleted file mode 100644 index b4b2da2..0000000 --- a/初学记/test_extract.py +++ /dev/null @@ -1,24 +0,0 @@ -import bs4 -import re -import sys - -def parse_html(filepath): - with open(filepath, "r", encoding="utf-8") as f: - soup = bs4.BeautifulSoup(f.read(), "html.parser") - - current_section = None - - poem_divs = soup.find_all("div", class_="poem") - for div in poem_divs: - # Before we process children, let's look at p tags - for p in div.find_all("p"): - for child in p.children: - if child.name == "span" and child.get("id") and child.text.strip().endswith("部"): - current_section = child.text.strip() - print("Found Section:", current_section) - elif child.name == "br": - pass - elif type(child) == bs4.element.NavigableString: - pass - -parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml") diff --git a/初学记/test_extract2.py b/初学记/test_extract2.py deleted file mode 100644 index b0a7f77..0000000 --- a/初学记/test_extract2.py +++ /dev/null @@ -1,54 +0,0 @@ -import bs4 -import re -import os - -html_dir = "epub_extracted/OPS" - - -def parse_html(filepath): - with open(filepath, "r", encoding="utf-8") as f: - soup = bs4.BeautifulSoup(f.read(), "html.parser") - - poem_divs = soup.find_all("div", class_="poem") - - events = [] - - for div in poem_divs: - for p in div.find_all("p"): - current_line = [] - for child in p.children: - if child.name == "br": - if current_line: - events.append(("line", "".join(current_line).strip())) - current_line = [] - elif ( - child.name == "span" - and child.get("id") - and child.text.strip().endswith("部") - ): - events.append(("section", child.text.strip())) - elif child.name == "small": - # Convert small text to brackets - small_text = child.get_text() - # It might already have brackets inside. - if not small_text.startswith("〈") and not small_text.startswith( - "(" - ): - current_line.append(f"({small_text})") - else: - # Sometimes it has pseudo-brackets like `叙事...` - # Let's clean the text up. We should just take the clean text and if it doesn't have brackets naturally (or we just process it to be clean). - # Let's look at child.get_text() - current_line.append(small_text) - else: - current_line.append(child.get_text()) - if current_line: - events.append(("line", "".join(current_line).strip())) - return events - - -events = parse_html( - os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml") -) -for e in events[:30]: - print(e) diff --git a/初学记/test_extract3.py b/初学记/test_extract3.py deleted file mode 100644 index 141820d..0000000 --- a/初学记/test_extract3.py +++ /dev/null @@ -1,127 +0,0 @@ -import bs4 -import os -import re - -html_dir = "epub_extracted/OPS" - - -def parse_html(filepath): - with open(filepath, "r", encoding="utf-8") as f: - soup = bs4.BeautifulSoup(f.read(), "html.parser") - - poem_divs = soup.find_all("div", class_="poem") - - events = [] - for div in poem_divs: - for p in div.find_all("p"): - current_line = [] - for child in p.children: - if child.name == "br": - if current_line: - events.append(("line", "".join(current_line).strip())) - current_line = [] - elif ( - child.name == "span" - and child.get("id") - and child.text.strip().endswith("部") - ): - events.append(("section", child.text.strip())) - elif child.name == "small": - small_text = child.get_text() - if not small_text.startswith("〈") and not small_text.startswith( - "(" - ): - current_line.append(f"〈{small_text}〉") - else: - current_line.append(small_text) - else: - current_line.append(child.get_text()) - if current_line: - events.append(("line", "".join(current_line).strip())) - return events - - -def extract_sections(entry_text): - result = {"叙事": "", "事对": "", "诗文": ""} - - narrative_match = re.search(r"〈(叙事|敘事)〉(.*)", entry_text) - if not narrative_match: - return result - - rest_after_narrative = narrative_match.group(2) - - shìduì_start = rest_after_narrative.find("事對") - if shìduì_start == -1: - shìduì_start = rest_after_narrative.find("事对") - - if shìduì_start != -1: - result["叙事"] = rest_after_narrative[:shìduì_start] - rest_after_shiduì = rest_after_narrative[shìduì_start + 2 :] - - genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)" - shiwen_match = re.search(genre_pattern, rest_after_shiduì) - - if shiwen_match: - split_idx = shiwen_match.start() + 1 - result["事对"] = rest_after_shiduì[:split_idx] - result["诗文"] = rest_after_shiduì[split_idx:] - else: - result["事对"] = rest_after_shiduì - else: - genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)" - shiwen_match = re.search(genre_pattern, rest_after_narrative) - if shiwen_match: - split_idx = shiwen_match.start() + 1 - result["叙事"] = rest_after_narrative[:split_idx] - result["诗文"] = rest_after_narrative[split_idx:] - else: - result["叙事"] = rest_after_narrative - - for k in result: - result[k] = result[k].replace("〈", "(").replace("〉", ")") - - return result - - -events = parse_html( - os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml") -) - -entries = [] -current_entry_text = [] - -for ev_type, text in events: - if ev_type == "section": - pass - else: - if "〈叙事〉" in text or "〈敘事〉" in text: - if current_entry_text: - full_text = "".join(current_entry_text) - match = re.search( - r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", - full_text, - ) - if match: - entry_name = match.group(1).strip() - sections = extract_sections(full_text) - entries.append((entry_name, sections)) - current_entry_text = [text] - else: - current_entry_text.append(text) - -if current_entry_text: - full_text = "".join(current_entry_text) - match = re.search( - r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", full_text - ) - if match: - entry_name = match.group(1).strip() - sections = extract_sections(full_text) - entries.append((entry_name, sections)) - -for entry_name, sections in entries[:3]: - print("Entry:", entry_name) - print("叙事:", sections["叙事"][:50]) - print("事对:", sections["事对"][:50]) - print("诗文:", sections["诗文"][:50]) - print("-" * 20) diff --git a/初学记/test_parse.py b/初学记/test_parse.py deleted file mode 100644 index d4f350f..0000000 --- a/初学记/test_parse.py +++ /dev/null @@ -1,35 +0,0 @@ -import bs4 -import os -import json -import re - -html_dir = "epub_extracted/OPS" - - -def parse_html(filepath): - with open(filepath, "r", encoding="utf-8") as f: - soup = bs4.BeautifulSoup(f.read(), "html.parser") - - poem_divs = soup.find_all("div", class_="poem") - texts = [] - for div in poem_divs: - for p in div.find_all("p"): - # We want to process the text inside

while respecting
as separators. - # But actually, inside a

, there are text nodes, , ,
, etc. - current_line = [] - for child in p.children: - if child.name == "br": - texts.append("".join(current_line).strip()) - current_line = [] - else: - current_line.append(child.get_text()) - if current_line: - texts.append("".join(current_line).strip()) - return texts - - -texts = parse_html( - os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml") -) -for i, t in enumerate(texts[:50]): - print(f"{i}: {t}") diff --git a/初学记/test_parse2.py b/初学记/test_parse2.py deleted file mode 100644 index 3642144..0000000 --- a/初学记/test_parse2.py +++ /dev/null @@ -1,93 +0,0 @@ -import bs4 -import os -import json -import re - -html_dir = "epub_extracted/OPS" - - -def parse_html(filepath): - with open(filepath, "r", encoding="utf-8") as f: - soup = bs4.BeautifulSoup(f.read(), "html.parser") - - poem_divs = soup.find_all("div", class_="poem") - texts = [] - for div in poem_divs: - for p in div.find_all("p"): - current_line = [] - for child in p.children: - if child.name == "br": - texts.append("".join(current_line).strip()) - current_line = [] - else: - current_line.append(child.get_text()) - if current_line: - texts.append("".join(current_line).strip()) - return texts - - -def extract_sections(entry_text): - result = {"叙事": "", "事对": "", "诗文": ""} - - # Extract 叙事 - narrative_match = re.search(r"〈叙事〉(.*)", entry_text) - if not narrative_match: - return result - - rest_after_narrative = narrative_match.group(1) - - # Find 事对 - shìduì_start = rest_after_narrative.find("事對") - if shìduì_start == -1: - shìduì_start = rest_after_narrative.find("事对") - - if shìduì_start != -1: - result["叙事"] = rest_after_narrative[:shìduì_start] - rest_after_shiduì = rest_after_narrative[shìduì_start + 2 :] # skip "事對" - - # Find 诗文 start - # Match 〉 followed by a literary genre - genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)" - shiwen_match = re.search(genre_pattern, rest_after_shiduì) - - if shiwen_match: - split_idx = shiwen_match.start() + 1 # keep the genre character - result["事对"] = rest_after_shiduì[:split_idx] - result["诗文"] = rest_after_shiduì[split_idx:] - else: - result["事对"] = rest_after_shiduì - else: - # No 事对 - genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)" - shiwen_match = re.search(genre_pattern, rest_after_narrative) - if shiwen_match: - split_idx = shiwen_match.start() + 1 - result["叙事"] = rest_after_narrative[:split_idx] - result["诗文"] = rest_after_narrative[split_idx:] - else: - result["叙事"] = rest_after_narrative - - return result - - -texts = parse_html( - os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml") -) -entries = [] -for line in texts: - if "〈叙事〉" in line: - entry_name = line.split("〈叙事〉")[0] - # remove "第X" from entry_name - word_name = re.sub( - r"第[一二三四五六七八九十百]+(?:[上下])?$", "", entry_name - ).strip() - sections = extract_sections(line) - entries.append((word_name, entry_name, sections)) - -for e in entries[:3]: - print(f"Word: {e[0]}") - print(f"Entry: {e[1]}") - print(f"叙事 len: {len(e[2]['叙事'])}") - print(f"事对 len: {len(e[2]['事对'])}") - print(f"诗文 len: {len(e[2]['诗文'])}") - print("-" * 20) diff --git a/初学记/test_split.py b/初学记/test_split.py deleted file mode 100644 index d7f1465..0000000 --- a/初学记/test_split.py +++ /dev/null @@ -1,41 +0,0 @@ -import re - -def extract_sections(content_text): - result = {"叙事": "", "事对": "", "诗文": ""} - - rest_after_narrative = content_text - - shìduì_start = rest_after_narrative.find("事對") - if shìduì_start == -1: - shìduì_start = rest_after_narrative.find("事对") - - genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对|銘|論)" - - if shìduì_start != -1: - result["叙事"] = rest_after_narrative[:shìduì_start] - rest_after_shiduì = rest_after_narrative[shìduì_start + 2 :] - - shiwen_match = re.search(genre_pattern, rest_after_shiduì) - - if shiwen_match: - split_idx = shiwen_match.start() + 1 - result["事对"] = rest_after_shiduì[:split_idx] - result["诗文"] = rest_after_shiduì[split_idx:] - else: - result["事对"] = rest_after_shiduì - else: - shiwen_match = re.search(genre_pattern, rest_after_narrative) - if shiwen_match: - split_idx = shiwen_match.start() + 1 - result["叙事"] = rest_after_narrative[:split_idx] - result["诗文"] = rest_after_narrative[split_idx:] - else: - result["叙事"] = rest_after_narrative - - for k in result: - result[k] = result[k].replace("〈", "(").replace("〉", ")") - - return result - -import json -print(extract_sections("這里是叙事事對這裡是事對〉詩這里是詩文"))