Update: 删除垃圾程序

This commit is contained in:
denglifan
2026-03-22 16:43:10 +08:00
parent 183b842090
commit 2881163106
41 changed files with 93 additions and 2046 deletions

View File

@@ -1,8 +0,0 @@
import json

# Quick inspection helper: show the first 20 dictionary entries (词条)
# stored under the rhyme character "𩅰" in the parsed Peiwenyunfu JSON.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    payload = json.load(fh)

entries = payload.get("𩅰", {}).get("词条", {})
for key, body in list(entries.items())[:20]:
    print(f"{key}: {body[:30]}...")

View File

@@ -1,7 +0,0 @@
import json

# Debug helper: pretty-print a handful of rhyme sections from the v2 JSON.
# NOTE(review): three keys in the list below were lost to encoding
# corruption (empty strings) — the original characters are unrecoverable
# from this source and must be restored before this script is useful.
with open('peiwenyunfu_v2.json', 'r', encoding='utf-8') as fh:
    payload = json.load(fh)

for rhyme_key in ['𩅰', '', '', '']:
    print(f"Rhyme: {rhyme_key}")
    print(json.dumps(payload.get(rhyme_key, {}), ensure_ascii=False, indent=2))

View File

@@ -1,71 +0,0 @@
import os
import time
import urllib.request
import urllib.parse
from urllib.error import HTTPError
def main():
    """Fetch every URL listed in still_missing.txt into html_files/.

    All requests go through the local HTTP proxy on 127.0.0.1:10808; each
    page is attempted up to five times, backing off on 403 responses.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")

    with open("still_missing.txt", "r", encoding="utf-8") as fh:
        url_list = [ln.strip() for ln in fh if ln.strip()]

    # Route all urllib traffic through the local proxy.
    handler = urllib.request.ProxyHandler(
        {'http': 'http://127.0.0.1:10808', 'https': 'http://127.0.0.1:10808'})
    urllib.request.install_opener(urllib.request.build_opener(handler))

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Referer': 'https://zh.wikisource.org/',
    }

    done = 0
    total = len(url_list)
    for page_url in url_list:
        vol = urllib.parse.unquote(page_url).split('/')[-1]
        target = f'html_files/{vol}.html'
        if os.path.exists(target):
            print(f"[{done+1}/{total}] Skipping {vol} (exists)")
            done += 1
            continue
        print(f"[{done+1}/{total}] Downloading {vol}...")
        fetched = False
        for _ in range(5):
            try:
                req = urllib.request.Request(page_url, headers=request_headers)
                response = urllib.request.urlopen(req, timeout=15)
                body = response.read().decode('utf-8')
                with open(target, 'w', encoding='utf-8') as out_f:
                    out_f.write(body)
                print(f" -> Success!")
                fetched = True
                break
            except HTTPError as err:
                if err.code == 403:
                    print(f" -> 403 Forbidden. Waiting 5 seconds...")
                    time.sleep(5)
                else:
                    print(f" -> HTTP Error {err.code}")
            except Exception as err:
                print(f" -> Error: {err}")
                time.sleep(2)
        if not fetched:
            print(f" -> Failed all attempts.")
        done += 1
        time.sleep(1)  # Be nice to server


if __name__ == "__main__":
    main()

View File

@@ -1,110 +0,0 @@
import urllib.request
from bs4 import BeautifulSoup
import urllib.parse
import re
import os
url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"
def chinese_to_arabic(cn_str):
    """Convert a Chinese numeral string (e.g. "一百零六") to an int.

    Supports digits 〇/零/一..九 and units 十/百/千.

    Bug fix: the character keys of `cn_num` and the unit comparisons below
    were lost to encoding corruption in the original file (they all read as
    empty strings, which made the function return 0 for every input). They
    have been restored to the standard numerals the algorithm clearly
    expects — confirm against the original repository if possible.
    """
    cn_num = {
        "〇": 0, "零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10,
        "百": 100, "千": 1000,
    }
    result = 0
    temp = 0
    for char in cn_str:
        if char in ["百", "千"]:
            # A bare unit (e.g. "百") means one of that unit.
            if temp == 0:
                temp = 1
            result += temp * cn_num[char]
            temp = 0
        elif char == "十":
            if temp == 0:
                temp = 1
            if len(cn_str) == 1:
                return 10  # "十" alone is exactly ten
            elif result == 0 and temp == 1 and cn_str[0] == "十":
                result += 10  # leading "十X" form, e.g. "十六"
                temp = 0
            else:
                result += temp * cn_num[char]
                temp = 0
        else:
            temp = cn_num.get(char, 0)
    result += temp
    return result
def get_filename(vol_str):
    """Derive the on-disk HTML file name from a volume title such as 卷一百之三."""
    part_match = re.match(r"卷(.+?)之(.+)", vol_str)
    if part_match:
        major = chinese_to_arabic(part_match.group(1))
        minor = chinese_to_arabic(part_match.group(2))
        return f"{major:03d}{minor}.html"
    whole_match = re.match(r"卷(.+)", vol_str)
    if whole_match:
        return f"{chinese_to_arabic(whole_match.group(1)):03d}.html"
    # Not a 卷 title at all: fall back to the raw string.
    return vol_str + ".html"
def main():
    """Collect still-missing volume URLs from the Wikisource index page.

    Fetches the index (module-level `url`), extracts every 卷X之Y link,
    compares the derived file names against the HTML already on disk, and
    writes the absent URLs to missing_urls.txt.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        print("Failed to fetch:", e)
        # To avoid failing the test if the network is down but it mocks something else
        # we can just pass here, but normally a mocked urllib would work.
        pass
    try:
        # If the fetch above failed, `html` is unbound and this raises
        # NameError, which is caught below to abort quietly.
        soup = BeautifulSoup(html, "html.parser")
    except NameError:
        return
    html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/"
    existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set()
    missing_urls = []
    seen_urls = set()
    base_url = "https://zh.wikisource.org"
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue
        unquoted_href = urllib.parse.unquote(href)
        # Only include `卷XXX之Y` links (ignore 全览 files)
        if "御定佩文韻府" in unquoted_href and "四庫全書本" in unquoted_href and "/卷" in unquoted_href:
            title_part = unquoted_href.split("/")[-1]
            if "全覽" in title_part or "全览" in title_part:
                continue
            # Filter for `卷XXX之Y` pattern if strictly needed.
            # But let's check regex pattern "卷.+?之.+"
            if not re.match(r"卷.+?之.+", title_part):
                continue
            full_url = urllib.parse.urljoin(base_url, href)
            # Drop fragment anchors so duplicates collapse to one URL.
            full_url = full_url.split('#')[0]
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            filename = get_filename(title_part)
            if filename not in existing_files:
                missing_urls.append(full_url)
    with open("missing_urls.txt", "w", encoding="utf-8") as f:
        for u in missing_urls:
            f.write(u + "\n")
    print(f"Found {len(missing_urls)} missing URLs.")


if __name__ == "__main__":
    main()

View File

@@ -1,14 +0,0 @@
import json

# Replace the placeholder character in every entry body with its rhyme key.
# NOTE(review): the placeholder literal was lost to encoding corruption and
# now reads as an empty string; `content.replace("", rhyme)` inserts the
# rhyme between every character, so the original placeholder must be
# restored before running this script.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    book = json.load(fh)

for rhyme, section in book.items():
    if rhyme in ["metadata", "preface"]:
        continue
    if "词条" in section:
        for headword, body in section["词条"].items():
            if "" in body:
                section["词条"][headword] = body.replace("", rhyme)

with open('peiwenyunfu.json', 'w', encoding='utf-8') as fh:
    json.dump(book, fh, ensure_ascii=False, indent=4)

View File

@@ -1,69 +0,0 @@
import json
import re
# Load the merged dictionary produced by the combine step.
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)
# NOTE(review): two prefix entries were lost to encoding corruption (empty
# strings); stripping an empty prefix is a no-op, so only 韻藻 takes effect.
prefixes = ["韻藻", "", ""]


def clean_headword(word):
    """Strip known section prefixes (e.g. 韻藻) off a headword, at most twice."""
    stripped = word
    for _ in range(2):  # handles doubled prefixes such as 増韻藻
        for prefix in prefixes:
            if stripped.startswith(prefix) and len(stripped) > len(prefix):
                stripped = stripped[len(prefix):]
    return stripped


def replace_pipes_in_content(content, word):
    """Expand runs of the ditto mark 丨 into repetitions of the headword.

    A run whose length is a multiple of the headword length is treated as
    whole-word repetition (and resets alignment); otherwise characters are
    drawn from the headword cyclically, continuing where the previous
    partial run stopped.
    """
    headword = clean_headword(word)
    size = len(headword)
    if size == 0 or "" not in content:
        return content

    state = {"idx": 0}

    def expand(match):
        run = match.group(0)
        length = len(run)
        if length % size == 0:
            # Whole-word repetition: reset the cyclic position.
            state["idx"] = 0
            return headword * (length // size)
        pieces = []
        for _ in range(length):
            pieces.append(headword[state["idx"] % size])
            state["idx"] += 1
        return "".join(pieces)

    return re.sub(r'丨+', expand, content)
print("Processing...")
for rhyme, section in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    # 1. Patch the small-rhyme description with the rhyme character itself.
    if "小韵描述" in section and section["小韵描述"]:
        # NOTE(review): the placeholder literal was lost to encoding
        # corruption (empty string); replace("", rhyme) inserts the rhyme
        # between every character — restore the real placeholder.
        section["小韵描述"] = section["小韵描述"].replace("", rhyme)
    # 2. Expand ditto marks in every entry body.
    if "词条" in section:
        section["词条"] = {
            headword: replace_pipes_in_content(body, headword)
            for headword, body in section["词条"].items()
        }
print("Saving peiwenyunfu.json...")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as fh:
    json.dump(data, fh, ensure_ascii=False, indent=4)
print("Done!")

View File

@@ -1,24 +0,0 @@
import json

print("Loading...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    book = json.load(fh)

# Substitute the placeholder with the rhyme character in the 对语/摘句
# sections and in entry headwords.
# NOTE(review): the placeholder literal was lost to encoding corruption
# (empty string); replace("", x) inserts x between every character, so the
# original literal must be restored before use.
for rhyme, section in book.items():
    if rhyme in ["metadata", "preface"]:
        continue
    if "对语" in section and section["对语"]:
        section["对语"] = section["对语"].replace("", rhyme)
    if "摘句" in section and section["摘句"]:
        section["摘句"] = section["摘句"].replace("", rhyme)
    if "词条" in section:
        section["词条"] = {
            headword.replace("", rhyme): body
            for headword, body in section["词条"].items()
        }

with open('peiwenyunfu.json', 'w', encoding='utf-8') as fh:
    json.dump(book, fh, ensure_ascii=False, indent=4)
print("Done!")

View File

@@ -1,57 +0,0 @@
import json
print("Loading...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Carry the bookkeeping sections over verbatim; everything else is rebuilt.
new_data = {}
new_data['metadata'] = data['metadata']
new_data['preface'] = data['preface']
# NOTE(review): two prefix entries were lost to encoding corruption and now
# read as empty strings.
prefixes = ["韻藻", "韵藻", "", ""]


def clean_key(k):
    """Repeatedly strip known prefixes off headword *k* until none match.

    Bug fix: `k.startswith("")` is always true for the corrupted empty
    prefixes, which left `changed` permanently set and made this loop spin
    forever on any non-empty key. Empty prefixes are now skipped so the
    loop terminates.
    """
    changed = True
    while changed:
        changed = False
        for p in prefixes:
            if not p:  # corrupted (empty) prefix would never terminate
                continue
            if k.startswith(p) and len(k) > len(p):
                k = k[len(p):]
                changed = True
    return k
for rhyme, r_data in data.items():
    if rhyme in ['metadata', 'preface']:
        continue
    # Seed the rebuilt section for the main rhyme character.
    # NOTE(review): several field names below were lost to encoding
    # corruption (empty-string keys) — restore them before trusting output.
    new_data[rhyme] = {
        "": r_data[""],
        "": r_data[""],
        "": r_data[""],
        "小韵描述": r_data["小韵描述"],
        "韵藻": {},
        "对语": r_data.get("对语", ""),
        "摘句": r_data.get("摘句", "")
    }
    current_rhyme = rhyme
    for k, v in r_data.get("词条", {}).items():
        # A one-character key whose body opens with a dictionary-source name
        # (説文/廣韻/...) starts a new small-rhyme section.
        # NOTE(review): the marker list contains corrupted empty strings, so
        # this condition is currently always true for 1-char keys.
        if len(k) == 1 and any(x in v[:15] for x in ['', '説文', '廣韻', '玉篇', '集韻', '韻㑹', '', '', '釋名', '爾雅']):
            current_rhyme = k
            new_data[current_rhyme] = {
                "": r_data[""],
                "": r_data[""],
                "": r_data[""],
                "小韵描述": k + v,
                "韵藻": {},
                "对语": "",
                "摘句": ""
            }
        else:
            new_data[current_rhyme]["韵藻"][clean_key(k)] = v

with open('peiwenyunfu_v2.json', 'w', encoding='utf-8') as fh:
    json.dump(new_data, fh, ensure_ascii=False, indent=4)
print(f"Old size: {len(data)}, New size: {len(new_data)}")

View File

@@ -1,28 +0,0 @@
import json
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Forward-fill the volume field: sections parsed without a volume inherit
# the most recent non-empty one (dict order preserves parse order).
# NOTE(review): the volume field name was lost to encoding corruption and
# now reads as the empty string "".
last_volume = ""
fixed_count = 0
for rhyme, section in data.items():
    if rhyme in ['metadata', 'preface']:
        continue
    volume = section.get("", "")
    if volume.strip():
        last_volume = volume.strip()
    elif last_volume:
        section[""] = last_volume
        fixed_count += 1

print(f"Fixed {fixed_count} missing volumes.")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as fh:
    json.dump(data, fh, ensure_ascii=False, indent=4)
print("Saved to peiwenyunfu.json!")

View File

@@ -1,83 +0,0 @@
import os
import glob
import json
import re
from parser import parse_html
def natural_sort_key(s):
    """Sort key for paths like html_files/卷001之1.html → (1, volume, part).

    File names not matching the 卷N[之M] pattern sort after everything else
    via the leading tuple element.
    """
    name = os.path.basename(s)
    hit = re.search(r"卷(\d+)(?:之(\d+))?", name)
    if not hit:
        return (2, 0, 0)  # fallback bucket
    volume = int(hit.group(1))
    part = int(hit.group(2)) if hit.group(2) else 0
    return (1, volume, part)
def main():
    """Parse every per-volume HTML file and merge the results into one JSON
    under a "content" key (wrapped with book metadata)."""
    # Only pick files that start with "卷" to avoid "全覽" duplicate aggregations
    files = glob.glob('html_files/卷*.html')
    files.sort(key=natural_sort_key)
    print(f"Starting to parse {len(files)} files...")
    combined_result = {}
    success_count = 0
    fail_count = 0
    for idx, fpath in enumerate(files):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Remove "(韵母)" prefix
                clean_key = k.replace("(韵母)", "")
                if clean_key not in combined_result:
                    combined_result[clean_key] = v
                else:
                    # Merge entries
                    combined_result[clean_key]["词条"].update(v.get("词条", {}))
                    if "对语" in v and v["对语"]:
                        combined_result[clean_key]["对语"] += v["对语"]
                    if "摘句" in v and v["摘句"]:
                        combined_result[clean_key]["摘句"] += v["摘句"]
                    # Also, if the initial file didn't have "卷" properly parsed, update it
                    # NOTE(review): the three field names below were lost to
                    # encoding corruption (empty strings) — restore them.
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")
    # Construct final output with metadata
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等",
            "dynasty": "",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": "",
        "content": combined_result
    }
    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()

View File

@@ -1,79 +0,0 @@
import os
import glob
import json
import re
from parser import parse_html
def natural_sort_key(s):
    """Order volume file paths numerically by (卷 number, 之 part)."""
    base = os.path.basename(s)
    found = re.search(r"卷(\d+)(?:之(\d+))?", base)
    if found is None:
        return (2, 0, 0)  # non-matching names sort last
    return (1, int(found.group(1)), int(found.group(2) or 0))
def main():
    """Parse all volume HTML files and merge them into peiwenyunfu.json.

    Unlike the earlier combine script, rhyme sections are stored at the top
    level of the output (next to "metadata"/"preface") rather than under a
    "content" key.
    """
    files = glob.glob('html_files/卷*.html')
    files.sort(key=natural_sort_key)
    print(f"Starting to parse {len(files)} files...")
    combined_result = {}
    success_count = 0
    fail_count = 0
    for idx, fpath in enumerate(files):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Remove "(韵母)" prefix
                clean_key = k.replace("(韵母)", "")
                if clean_key not in combined_result:
                    combined_result[clean_key] = v
                else:
                    # Merge entries
                    combined_result[clean_key]["词条"].update(v.get("词条", {}))
                    if "对语" in v and v["对语"]:
                        combined_result[clean_key]["对语"] += v["对语"]
                    if "摘句" in v and v["摘句"]:
                        combined_result[clean_key]["摘句"] += v["摘句"]
                    # Backfill fields the first-seen file failed to parse.
                    # NOTE(review): the three field names below were lost to
                    # encoding corruption (empty strings) — restore them.
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等奉敕编",
            "dynasty": "",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": ""
    }
    final_output.update(combined_result)
    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()

View File

@@ -1,9 +0,0 @@
def simplify_char(c):
    """Map a traditional character to its simplified form (identity fallback).

    NOTE(review): the mapping pairs were lost to encoding corruption — only
    empty-string entries remain, so this currently returns its input
    unchanged. Restore the original character pairs.
    """
    mapping = {'': '', '': '', '': ''}  # add others if needed
    return mapping.get(c, c)


# Chinese digit -> ASCII digit lookup.
# NOTE(review): the digit characters were lost to encoding corruption.
num_map = {'': '1', '': '2', '': '3', '': '4', '': '5', '': '6', '': '7', '': '8', '': '9'}
# "01之一"
# If vol_m = re.search(r"卷(.)之(.)", text)
# "0" + num_map[vol_m.group(1)] + "之" + vol_m.group(2)

View File

@@ -1,124 +0,0 @@
import re
from bs4 import BeautifulSoup
def simplify(text):
    """Replace traditional variants in *text* with simplified forms.

    NOTE(review): every mapping pair was lost to encoding corruption (all
    keys/values read as empty strings), so this is currently a no-op —
    restore the original traditional→simplified pairs.
    """
    mapping = {
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
    }
    simplified = text
    for trad, simp in mapping.items():
        simplified = simplified.replace(trad, simp)
    return simplified
def parse_html(file_path):
    """Parse one Wikisource volume page into {"(韵母)X": section-dict} form.

    Each section dict holds the volume/tone/rhyme fields, a 小韵描述
    description, and the 词条/对语/摘句 content blocks.

    NOTE(review): several string literals in this function were lost to
    encoding corruption (empty strings, same-character replaces, empty dict
    keys); restore them before trusting the output. Indentation was also
    reconstructed from a whitespace-mangled source — verify against the
    original file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}
    text = poem_div.get_text()
    # Extract Tone and Rhyme
    tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
    current_tone = simplify(tone_rhyme_m.group(1)) if tone_rhyme_m else ""
    if tone_rhyme_m:
        raw_rhyme = tone_rhyme_m.group(2).strip()
        m_rhyme = re.match(r"(.*?)[韻]", raw_rhyme)
        current_rhyme = simplify(m_rhyme.group(1)) if m_rhyme else simplify(raw_rhyme.replace('', ''))
    else:
        current_rhyme = ""
    # Extract Volume
    vol_m = re.search(r"卷([一二三四五六七八九十]+)之(.+)", text)
    if vol_m:
        # NOTE(review): the numeral keys below were lost to corruption.
        num_map = {'':'1', '':'2', '':'3', '':'4', '':'5', '':'6', '':'7', '':'8', '':'9', '':'10'}
        v1 = vol_m.group(1)
        v1_digit = "".join(num_map.get(c, c) for c in v1)
        if len(v1_digit) == 1:
            v1_str = f"0{v1_digit}"
        else:
            v1_str = v1_digit
        # Fix: the volume match might capture extra text after '之', e.g. '之一\n'
        v2 = vol_m.group(2).split('\n')[0].strip()
        current_vol = f"{v1_str}{v2}"
    else:
        current_vol = ""
    # Extract chars: the line right after the tone/rhyme header lists the
    # single rhyme characters covered by this section.
    lines = text.split('\n')
    rhyme_chars = []
    for i, line in enumerate(lines):
        if tone_rhyme_m and tone_rhyme_m.group(2) in line:
            chars_line = lines[i+1]
            rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1]
            break
    # Drop boilerplate lines (book title, library header, tone headers and
    # bare rhyme-character listings) before tokenizing.
    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped: continue
        if stripped == "欽定四庫全書": continue
        if stripped.startswith("御定佩文韻府"): continue
        if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped:
            if "" in stripped: continue
        if all(c in rhyme_chars or c in '  ' for c in stripped):
            continue
        clean_lines.append(stripped)
    clean_text = "".join(clean_lines)
    # Tokenize as (headword, one-or-more 〈…〉 annotation blocks) pairs.
    tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
    result = {}
    current_char = None
    for word, desc_blocks in tokens:
        word = word.strip()
        desc_content = desc_blocks.replace('', '').replace('', '')
        # Is this a main character definition?
        if word in rhyme_chars:
            current_char = word
            simplified_char = simplify(current_char)
            # Create a new entry in result
            key = f"(韵母){simplified_char}"
            if key not in result:
                result[key] = {
                    "": current_vol,
                    "": current_tone,
                    "": current_rhyme,
                    "小韵描述": simplify(current_char + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": ""
                }
        elif word == "對語" or word == "对语":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                # Could be multiple parts, though usually one per character block
                result[key]["对语"] += desc_content
        elif word == "摘句":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                result[key]["摘句"] += desc_content
        else:
            # It's a 词条
            if current_char and word:
                key = f"(韵母){simplify(current_char)}"
                result[key]["词条"][word] = desc_content
    return result

View File

@@ -1,65 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
def main():
    """Download every URL in missing_urls.txt into html_files/ through the
    local proxy, retrying each page up to three times."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as fh:
        targets = [ln.strip() for ln in fh if ln.strip()]
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    for url in targets:
        # Decode percent-encoding to recover the volume name, e.g. 卷001之1.
        volume_name = urllib.parse.unquote(url).split("/")[-1]
        filename = f"html_files/{volume_name}.html"
        if os.path.exists(filename):
            print(f"Skipping (unknown), already exists.")
            continue
        print(f"Downloading {volume_name} from {url}...")
        success = False
        retries = 3
        while not success and retries > 0:
            try:
                response = requests.get(
                    url, headers=headers, proxies=proxies, timeout=15
                )
                if response.status_code == 200:
                    with open(filename, "w", encoding="utf-8") as out_f:
                        out_f.write(response.text)
                    success = True
                    print(f"Successfully downloaded (unknown)")
                else:
                    print(
                        f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}"
                    )
            except RequestException as e:
                print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")
            if not success:
                retries -= 1
                time.sleep(2)  # Delay before retry
        if not success:
            print(f"Failed to download {url} after all retries.")
        time.sleep(0.5)  # Small delay between requests


if __name__ == "__main__":
    main()

View File

@@ -1,65 +0,0 @@
import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector, ProxyType
import os
import urllib.parse
import time
async def download_file(session, url, filename, semaphore):
    """Download *url* to *filename*, retrying up to 3 times.

    Concurrency is bounded by *semaphore*. Returns True on success (or when
    the file already exists), False after all retries fail.
    """
    async with semaphore:
        if os.path.exists(filename):
            print(f"Skipping (unknown)")
            return True
        retries = 3
        while retries > 0:
            try:
                async with session.get(url, timeout=30) as response:
                    if response.status == 200:
                        content = await response.read()
                        # Write raw bytes: avoids any re-encoding of the page.
                        with open(filename, "wb") as f:
                            f.write(content)
                        print(f"Successfully downloaded (unknown)")
                        return True
                    else:
                        print(f"HTTP Error {response.status} for {url}")
            except Exception as e:
                print(f"Error for {url}: {e}")
            retries -= 1
            await asyncio.sleep(2)
        print(f"Failed all retries for {url}")
        return False
async def main():
    """Read missing_urls.txt and download all pages concurrently (max 10),
    routed through a SOCKS5 proxy."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # To satisfy the instruction: SOCKS5 proxy 127.0.0.1:10808
    # aiohttp_socks uses socks5:// instead of socks5h:// but with rdns=True it is equivalent.
    proxy_url = "socks5://127.0.0.1:10808"
    connector = ProxyConnector.from_url(proxy_url, rdns=True)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    semaphore = asyncio.Semaphore(10)  # 10 concurrent downloads
    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        tasks = []
        for url in urls:
            decoded_url = urllib.parse.unquote(url)
            volume_name = decoded_url.split("/")[-1]
            filename = f"html_files/{volume_name}.html"
            tasks.append(download_file(session, url, filename, semaphore))
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,65 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
import concurrent.futures
def download_url(url):
    """Download one Wikisource page to html_files/<volume>.html.

    Returns (True, filename) on success or when the file already exists,
    (False, filename) after 3 failed attempts.

    Fixes vs. original: the trailing duplicated `return False, filename`
    was unreachable dead code, and RequestException was silently swallowed
    (`except ... pass`) — it is now reported so failures are diagnosable.
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"
    if os.path.exists(filename):
        # Already exists
        return True, filename
    # Count retries down so the "Retries left" messages match the original.
    for retries in range(3, 0, -1):
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                print(f"Successfully downloaded (unknown)")
                return True, filename
            print(f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}")
        except RequestException as e:
            print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")
        time.sleep(2)
    print(f"Failed to download {url} after all retries.")
    return False, filename
def main():
    """Fan out download_url over missing_urls.txt with 20 worker threads."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as fh:
        pending = [ln.strip() for ln in fh if ln.strip()]
    # Use ThreadPoolExecutor to download multiple files concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as pool:
        jobs = [pool.submit(download_url, target) for target in pending]
        for job in concurrent.futures.as_completed(jobs):
            try:
                job.result()
            except Exception as exc:
                print(f"Exception during download: {exc}")


if __name__ == "__main__":
    main()

View File

@@ -1,79 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
import concurrent.futures
def download_url(url):
    """Download one page to html_files/<volume>.html through the local proxy.

    Retries up to 5 times, waiting longer after a 403 to back off from rate
    limiting. Returns (True, filename) on success or when the file already
    exists, (False, filename) after all attempts fail.

    Fixes vs. original: removed the unreachable duplicated
    `return False, filename` and stopped silently swallowing
    RequestException (it is now logged).
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"
    if os.path.exists(filename):
        return True, filename
    # Count retries down so the "Retries left" messages match the original.
    for retries in range(5, 0, -1):
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                print(f"Successfully downloaded (unknown)")
                return True, filename
            if response.status_code == 403:
                print(
                    f"HTTP Error 403 for {url}. Waiting longer to avoid rate limit..."
                )
                time.sleep(5)
            else:
                print(
                    f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}"
                )
        except RequestException as e:
            print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")
        time.sleep(3)
    print(f"Failed to download {url} after all retries.")
    return False, filename
def main():
    """Retry the remaining downloads from still_missing.txt with 5 threads."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("still_missing.txt", "r", encoding="utf-8") as fh:
        pending = [ln.strip() for ln in fh if ln.strip()]
    print(f"Starting download for {len(pending)} remaining files...")
    # Keep concurrency low (5 workers) so the server does not answer with
    # 403 rate-limit responses.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        jobs = [pool.submit(download_url, target) for target in pending]
        for job in concurrent.futures.as_completed(jobs):
            try:
                job.result()
            except Exception as exc:
                print(f"Exception during download: {exc}")


if __name__ == "__main__":
    main()

View File

@@ -1,96 +0,0 @@
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7070%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7077%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B02
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B03
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B04
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B05
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B06
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B07
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B08
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B09
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B10
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B9
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B010
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B9
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B010
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B011
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B012
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7106%E4%B9%8B2

View File

@@ -1,53 +0,0 @@
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
text = soup.find("div", class_="poem").get_text()
# Extract Tone and Rhyme
tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
if tone_rhyme_m:
print("Tone:", tone_rhyme_m.group(1))
print("Rhyme:", tone_rhyme_m.group(2))
# Extract Volume
vol_m = re.search(r"卷(.)之(.)", text)
if vol_m:
print("Volume:", f"0{vol_m.group(1)}{vol_m.group(2)}" if vol_m.group(1) in '一二三四五六七八九' else f"{vol_m.group(1)}{vol_m.group(2)}")
# Extract chars
lines = text.split('\n')
rhyme_chars = []
for i, line in enumerate(lines):
if tone_rhyme_m.group(2) in line:
# the next line usually has the characters
chars_line = lines[i+1]
rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1]
break
print("Chars:", rhyme_chars)
# Now, we want to strip all lines that are headers.
# Actually, headers repeat: "欽定四庫全書", "御定佩文韻府...", "上平聲..."
# We can just filter out these known header lines!
clean_lines = []
for line in lines:
stripped = line.strip()
if not stripped: continue
if stripped == "欽定四庫全書": continue
if stripped.startswith("御定佩文韻府"): continue
if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped:
if "" in stripped: continue
# Is it the chars line?
if all(c in rhyme_chars or c in '  ' for c in stripped):
continue
clean_lines.append(stripped)
clean_text = "".join(clean_lines)
print("Start of clean text:", clean_text[:100])
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
print("First token:", tokens[0])

View File

@@ -1,10 +0,0 @@
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
if poem_div:
for i, p in enumerate(poem_div.find_all("p")[:20]):
print(f"--- P {i} ---")
print(p.text[:100].replace('\n', ' '))

View File

@@ -1,12 +0,0 @@
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
if poem_div:
lines = poem_div.get_text().split("\n")
lines = [line.strip() for line in lines if line.strip()]
for i, line in enumerate(lines[:50]):
print(f"{i}: {line}")

View File

@@ -1,5 +0,0 @@
import re

# Demo: split an entry line into (headword, angle-bracket description) pairs.
text = "對語〈渭北 江東〉〈 平北 安東〉摘句〈力障百川東〉"
# A headword is any run of characters outside 〈〉; its description is the one
# or more consecutive 〈...〉 groups that follow it.
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text)
for i, (word, desc_blocks) in enumerate(tokens):
    print(f"Token {i}: WORD='{word}' DESCS={desc_blocks}")

View File

@@ -1,54 +0,0 @@
import json
import re
# Load the parsed 佩文韻府 data produced earlier by the scraper/parser.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# Entry-key prefixes that are not part of the quoted word itself.
# NOTE(review): the second entry is an empty string in the recovered source —
# a character was probably lost during extraction; as written it strips
# nothing, so behaviour is unaffected. Confirm against the original file.
prefixes = ["韻藻", ""]


def replace_pipes(content, word):
    """Restore the abbreviated headword inside a quotation.

    In《佩文韻府》quotations the repeated headword is abbreviated with the
    vertical stroke 丨. Each 丨 in *content* is replaced with the matching
    character of *word* (after stripping a known key prefix), cycling through
    the word; a gap of 5+ non-stroke characters is treated as the start of a
    new occurrence and re-aligns the mapping to the word's first character.

    Bug fix: the 丨 literals had been lost from this block (leaving "" in the
    comparison and replace calls), which made the function a no-op.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break  # only strip one prefix
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: simply drop the placeholder strokes.
        return content.replace("丨", "")
    result = []
    pipe_idx = 0
    chars_since_last_pipe = 0
    for char in content:
        if char == "丨":
            if chars_since_last_pipe >= 5:
                # Long gap -> a new occurrence of the word begins here, so
                # restart the mapping at the first character.
                pipe_idx = 0
            result.append(clean_word[pipe_idx % word_len])
            pipe_idx += 1
            chars_since_last_pipe = 0
        else:
            result.append(char)
            chars_since_last_pipe += 1
    return "".join(result)
# Test specific words
# Each pair is (entry headword, quotation with 丨 placeholders) taken from
# the parsed data; the loop prints the original next to the restored text.
test_cases = [
("首陽東", "詩采葑采葑丨丨之丨"),
("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for w, c in test_cases:
print(f"Word: {w}")
print(f"Orig: {c}")
print(f"Fix : {replace_pipes(c, w)}")
print("-" * 40)

View File

@@ -1,37 +0,0 @@
import json
# Entry-key prefixes that are not part of the quoted word itself.
# NOTE(review): the "" entry looks like an extraction artefact (lost
# character); it strips nothing, so behaviour is unaffected.
prefixes = ["韻藻", ""]


def replace_pipes_no_reset(content, word):
    """Replace every 丨 placeholder with characters of *word*, strictly in
    order and cycling, with no gap-based re-alignment (contrast with the
    replace_pipes variant that resets after long gaps).

    Bug fix: the 丨 literals had been lost from this block (leaving "" in the
    comparison and replace calls), which made the function a no-op.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholder strokes.
        return content.replace("丨", "")
    result = []
    pipe_idx = 0
    for char in content:
        if char == "丨":
            result.append(clean_word[pipe_idx % word_len])
            pipe_idx += 1
        else:
            result.append(char)
    return "".join(result)
# Sample (headword, quotation) pairs for eyeballing the no-reset variant.
test_cases = [
("首陽東", "詩采葑采葑丨丨之丨"),
("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for w, c in test_cases:
print(f"Word: {w}")
print(f"NoRst: {replace_pipes_no_reset(c, w)}")
print("-" * 40)

View File

@@ -1,56 +0,0 @@
import json
import re
# Entry-key prefixes that are not part of the quoted word itself.
# NOTE(review): the "" entry looks like an extraction artefact (lost
# character); it strips nothing, so behaviour is unaffected.
prefixes = ["韻藻", ""]


def replace_pipes_hybrid(content, word):
    """Replace runs of 丨 with the characters of *word*.

    A run whose length is an exact multiple of the (prefix-stripped) word
    length is treated as one or more complete occurrences of the word and
    resets the cycling alignment; any other run continues the running
    character sequence from where the previous run stopped.

    Bug fix: the 丨 literal had been lost from the word_len == 0 branch
    (leaving replace("", "")), which made that branch a no-op.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholder strokes.
        return content.replace("丨", "")
    def repl(match):
        nonlocal pipe_idx
        block = match.group(0)
        block_len = len(block)
        if block_len % word_len == 0:
            # Full word match! Reset alignment.
            pipe_idx = 0
            return clean_word * (block_len // word_len)
        else:
            # Partial word match. Use current sequence.
            res = ""
            for _ in range(block_len):
                res += clean_word[pipe_idx % word_len]
                pipe_idx += 1
            return res
    pipe_idx = 0
    return re.sub(r'丨+', repl, content)
# Sample (headword, quotation) pairs for eyeballing the hybrid variant.
test_cases = [
("首陽東", "詩采葑采葑丨丨之丨"),
("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for w, c in test_cases:
print(f"Word: {w}")
print(f"Orig: {c}")
print(f"Hybr: {replace_pipes_hybrid(c, w)}")
print("-" * 40)
# Extra case: a run of three strokes for a three-character word exercises the
# "exact multiple -> reset" path.
test_cases.append(("紫殿東", "時魚躍丨丨丨温庭筠詩一夕丨丨"))
for w, c in test_cases[-1:]:
print(f"Word: {w}")
print(f"Orig: {c}")
print(f"Hybr: {replace_pipes_hybrid(c, w)}")
print("-" * 40)

View File

@@ -1,31 +0,0 @@
import json
from parser import parse_html
def test_parse_html():
"""Smoke-test parse_html on one volume and dump the result for inspection."""
file_path = "html_files/卷001之1.html"
result = parse_html(file_path)
# Save for manual inspection
with open("output.json", "w", encoding="utf-8") as f:
json.dump(result, f, ensure_ascii=False, indent=2)
# Check that it returns a dictionary
assert isinstance(result, dict)
# Let's see what keys are in it
keys = list(result.keys())
print("Keys found:", keys)
if len(keys) > 0:
first_key = keys[0]
# NOTE(review): the next three asserts test for the key "" — characters
# (likely 声/韵/调 field names) appear to have been lost in extraction;
# as written these asserts fail unless an empty-string key exists.
assert "" in result[first_key]
assert "" in result[first_key]
assert "" in result[first_key]
assert "小韵描述" in result[first_key]
assert "词条" in result[first_key]
assert "对语" in result[first_key]
assert "摘句" in result[first_key]
if __name__ == "__main__":
test_parse_html()
print("Tests passed!")

View File

@@ -1,49 +0,0 @@
import json
import re
# Exploratory script: for a few rhymes, check whether the number of 丨
# placeholders in each entry divides evenly by the headword length, and show
# the naive in-order replacement when it does.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# Analyze a few
count_match = 0
count_mismatch = 0
for rhyme, r_data in list(data.items())[:5]: # Skip metadata, preface
if rhyme in ["metadata", "preface"]:
continue
print(f"\nRhyme: {rhyme}")
# 1. 小韵描述
desc = r_data.get("小韵描述", "")
# NOTE(review): replace("", rhyme) inserts the rhyme between every character
# — the placeholder literal (likely 丨) was lost in extraction; confirm.
desc_fixed = desc.replace("", rhyme)
print(f"Desc original: {desc[:30]}...")
print(f"Desc fixed: {desc_fixed[:30]}...")
# 2. 词条
for word, content in list(r_data.get("词条", {}).items())[:5]:
# NOTE(review): count("") returns len(content)+1, never 0 — the 丨
# literal appears to have been lost here too.
pipe_count = content.count("")
word_len = len(word)
if pipe_count == 0:
continue
print(f"Word: {word} (len {word_len}), pipes: {pipe_count}")
print(f"Original: {content}")
# Test replacing
if pipe_count % word_len == 0:
# We can replace them in groups of word_len
fixed_content = ""
pipe_idx = 0
for char in content:
if char == "":
fixed_content += word[pipe_idx % word_len]
pipe_idx += 1
else:
fixed_content += char
print(f"Fixed: {fixed_content}")
count_match += 1
else:
print("MISMATCH length!")
count_mismatch += 1
print(f"\nMatches: {count_match}, Mismatches: {count_mismatch}")

View File

@@ -1,46 +0,0 @@
import asyncio
from playwright.async_api import async_playwright
import urllib.parse
import sys
async def main():
"""Fetch one URL (argv[1]) with headless Chromium, first through a SOCKS5
proxy and, on failure, directly; print the content length of each attempt."""
url = sys.argv[1]
proxy_server = "socks5://127.0.0.1:10808"
async with async_playwright() as p:
# Try with proxy
browser = await p.chromium.launch(headless=True, proxy={"server": proxy_server})
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
page = await context.new_page()
try:
print(f"Loading {url} via proxy...")
await page.goto(url, timeout=30000, wait_until="domcontentloaded")
content = await page.content()
print(f"Success! Content length: {len(content)}")
except Exception as e:
print(f"Error via proxy: {e}")
await browser.close()
# Try without proxy
print("Retrying without proxy...")
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
page = await context.new_page()
try:
await page.goto(url, timeout=30000, wait_until="domcontentloaded")
content = await page.content()
print(f"Success! Content length: {len(content)}")
except Exception as e2:
print(f"Error without proxy: {e2}")
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,21 +0,0 @@
import json
# Exploratory script: list entries whose 丨-placeholder count is not a
# multiple of the headword length.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
mismatches = []
prefixes_found = set()
for rhyme, r_data in data.items():
if rhyme in ["metadata", "preface"]:
continue
for word, content in r_data.get("词条", {}).items():
# NOTE(review): count("") returns len(content)+1 — the 丨 literal was
# likely lost in extraction; as written the 0-pipe skip never triggers.
pipe_count = content.count("")
if pipe_count == 0:
continue
if pipe_count % len(word) != 0:
mismatches.append((word, pipe_count))
print(f"Total mismatches: {len(mismatches)}")
for w, p in mismatches[:20]:
print(f"{w} (len {len(w)}), pipes: {p}")

View File

@@ -1,13 +0,0 @@
import json
# Exploratory script: print the raw content of a handful of specific entries
# that were problematic in the pipe-replacement experiments.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
for rhyme, r_data in data.items():
if rhyme in ["metadata", "preface"]:
continue
for word, content in r_data.get("词条", {}).items():
if word in ["紫殿東", "少微東", "東海東", "日夜東", "隔西東"]:
print(f"Word: {word}")
print(f"Content: {content}")
print("-" * 40)

View File

@@ -1,18 +0,0 @@
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
text = poem_div.get_text()
# Extract the list of characters.
# It appears after "一東韻一" or similar.
m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*?)\n(.*?)\n", text)
if m:
print("Tone:", m.group(1))
print("Rhyme:", m.group(2))
print("Chars line:", m.group(3))
rhyme_chars = [c for c in m.group(3).replace(' ', ' ').split() if len(c) == 1]
print("Chars:", rhyme_chars)

View File

@@ -1,32 +0,0 @@
import json
import re
# Exploratory script: like the earlier mismatch scan, but strip known key
# prefixes (韻藻 …) from the headword before dividing the placeholder count.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# NOTE(review): the "" prefix looks like an extraction artefact (lost char).
prefixes = ["韻藻", ""]
mismatches = []
total_pipes = 0
for rhyme, r_data in data.items():
if rhyme in ["metadata", "preface"]:
continue
for word, content in r_data.get("词条", {}).items():
clean_word = word
for p in prefixes:
if clean_word.startswith(p) and len(clean_word) > len(p):
clean_word = clean_word[len(p):]
# NOTE(review): count("") returns len(content)+1 — the 丨 literal was
# likely lost here; as written the 0-pipe skip never triggers.
pipe_count = content.count("")
if pipe_count == 0:
continue
total_pipes += 1
if pipe_count % len(clean_word) != 0:
mismatches.append((word, clean_word, pipe_count, content))
print(f"Total entries with pipes: {total_pipes}")
print(f"Total mismatches after stripping: {len(mismatches)}")
for w, cw, p, c in mismatches[:10]:
print(f"{w} -> {cw} (len {len(cw)}), pipes: {p}")
print(f" {c}")

View File

@@ -1,21 +0,0 @@
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
text = soup.find("div", class_="poem").get_text().replace('\n', '')
# Remove header junk for this test (just find the first '東〈')
start_idx = text.find('東〈')
text = text[start_idx:]
# Tokenize into pairs of (Word, Description)
# Using regex to find all Word〈Description〉
# Wait, multiple 〈 〉 can follow a word like 對語〈...〉〈...〉
# We can find all chunks of non-〈 characters, followed by one or more 〈...〉
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text)
for i, (word, desc_blocks) in enumerate(tokens[:20]):
print(f"Token {i}: WORD='{word}' DESCS={desc_blocks[:30]}...")

View File

@@ -0,0 +1,93 @@
# 类书分类与特点梳理
为辅助专业诗歌创作小程序提供“知识库”,整理现有类书数据的优缺点、特点及在辅助创作中的核心价值如下:
## 一、词条式
这类类书通常以主题(如天文地理、自然物候等)分类,适合根据意象(如“雪”、“月”)进行正向检索找语料。
- **《白孔六帖》**
- **内容与编排顺序**:按天文地理、历法礼仪、生活物件等类目编排。
- **特点**:采辑各种典籍中的成语、典故、短语,多为四字短句,并附带简短的释义。
- **整理问题**:提取很不干净,正文与释义括号混合在一起且缺乏断句。
- **辅助创作价值**可作为提炼“四字骈语”或冷僻典故的原始素材库但需大量NLP清洗。
- **典型例子**`"content": "白髙眀柔克(髙明天也柔克寒暑不干)隂隲下民(言天黙定下民之命)天尊(地卑)..."`
- **《北堂书钞》**
- **内容与编排顺序**:按帝王、后妃、政术、刑法、官职、礼仪等社会制度与名物编排。
- **特点**:成书于隋,引用起自三代、汉、魏,迄于宋、齐。侧重于对概念的追本溯源,对词条解释较为详细。
- **整理问题**JSON提取的正文部分缺乏标点长句粘连。
- **辅助创作价值**:古朴凝练。如创作者需要引用较为正统的经史概念(如写咏史诗),该书能提供最原汁原味的早期语料。
- **典型例子**`"content": "皇者天人之總稱 帝者天號 正氣爲帝 帝者天下之所適王者天下之所徃也..."`
- **《初学记》**
- **内容与编排顺序**:按天、岁时、官职、地理等编排。每个词条下细分为“叙事”、“事对”、“诗文”。
- **特点**:词条内容非常详细、层次分明,具有极强的结构化特征。
- **整理问题**整理得非常干净JSON层级保留了原始分类结构。
- **辅助创作价值****价值极高**。“事对”直接提供了现成的对仗词汇(写诗利器);“诗文”栏目则方便查阅前人咏此物的范本。
- **典型例子**`"事对": "轉葢 倚杵(桓譚新論天如葢轉左旋... 覆盆 轉轂(王充論衡曰..."`
- **《海录碎事》**
- **内容与编排顺序**:按非常细碎的关键词(天、地、衣冠等)分类。
- **特点**:每一类下词条过细(常为生僻两字词),每一词条下内容极少,通常只有一两句包含该词的引文。
- **整理问题**:词条过度碎片化,键名就是细碎词汇。
- **辅助创作价值**:相当于一个“逆向用词示例库”。诗人想用某个生僻意象时,用它查看古人如何将其嵌入诗句中。
- **典型例子**`"曽穹": [{"content": "蹀足循廣除瞬目矖曽穹(文選謝惠連詩)"}]`
- **《骈字类编》**
- **内容与编排顺序**:按天地、时令、山水、珍宝、器物等词汇大类编排。
- **特点**:专收“骈语”(双音节词),词条极多,详细列出了该词在各路经史子集中出现的位置。
- **整理问题**:长段引文粘连,缺少现代标点。
- **辅助创作价值****价值极高**。古诗词创作最核心的就是对“双字词汇”的拿捏,此书就是一个庞大且天然的古典双字词语境库。
- **典型例子**`"天地": "易干夫大人者与天地合其德 又坤天地变化草木蕃天地闭贤人隐..."`
- **《太平御览》**
- **内容与编排顺序**:以天、地、人、事、物为大类顺序。
- **特点**:在前代《修文殿御览》《艺文类聚》等书基础上编纂而成,包罗万象,词条内容全部为原文引文。
- **整理问题**:带有原书排版格式(换行、`《书名》曰`),阅读体验较佳。
- **辅助创作价值**:提供最详实的事物背景知识,适合在需要了解某个意象(如“雪”)的全面历史文化背景时使用。
- **典型例子**`"《三五曆記》曰:未有天地之時,混沌狀如雞子,溟涬始牙..."`
- **《艺文类聚》**
- **内容与编排顺序**:按天、岁时、地理、帝王、人、乐、职官等编排。
- **特点**:事文交织。词条收录较广,既有经史中的“叙事”,也有大量的历代【诗】、【赋】、【赞】。
- **整理问题**:带有原书的标点和分段标记,格式清晰。
- **辅助创作价值**:极好的文学创作资料库,帮助创作者一站式看到某个主题在古代诗文中的各种形态。
- **典型例子**`"【詩】晉傅玄《兩儀詩》曰:兩儀始分.元氣上清.列宿垂象.六位時成..."`
- **《玉海》**
- **内容与编排顺序**:详分天文、地理、典章制度等。
- **特点**:词条常为长篇大论,注重典章制度、天文地理的详细考证。
- **整理问题**:存在词条名拆分或提取不精准的问题(如把“中宫二十八舍”拆断处理)。正文缺乏标点。
- **辅助创作价值**:提供精准宏大的制度与天文星象知识,适合创作偏严肃或庙堂题材的诗歌。
- **典型例子**`"中宫": "漢天文志(史天官書同)中宫天極星其一明者泰一之常居也旁三星三公..."`
- **《渊鉴类函》**
- **内容与编排顺序**:按大部类排布。
- **特点**:清代集大成之作,将引文明确区分为“原”(原类书已有)与“增”(清代新增)。
- **整理问题**:使用空格作为句读分隔,未见全角标点。
- **辅助创作价值**:覆盖面最广的兜底宝库,适合查阅各种意象的演变和最全面的引文集合。
- **典型例子**`"原釋名曰天坦也坦然髙而逺也 增又曰天顯也在上髙顯也..."`
## 二、韵式
这类类书专为押韵而生,以“韵母”或“韵字”为一级分类,适合在写格律诗卡壳、需要找特定韵脚词汇时使用。
- **《佩文韵府》**
- **内容与编排顺序**:按平水韵分类(如“一东”),下系以该字为尾的各种词条及摘句。
- **特点**:非常详细,包含声调、韵部说明,以及海量的带出处短句。以元代《韵府群玉》和明代《五车韵瑞》为基础增补。
- **整理问题**JSON结构层次非常清晰。使用“丨”符号代替原韵字如“東”被替换为“丨”
- **辅助创作价值****写诗必备神器**。想用“东”韵时,能瞬间获得大量以东结尾的词汇(如“南东”、“活东”)及例句,极大辅助押韵。
- **典型例子**`"(韵母)东": { "小韵描述": "东德红切眷方也...", "词条": { "活東": "爾雅科斗丨丨蝦蟇也...", "牆東": "後漢書避世丨丨王君公..." } }`
- **《韵府群玉》**
- **内容与编排顺序**:按大韵分类,列出小韵和具体词条。
- **特点**:早期的韵书,条目较为简练紧凑。
- **整理问题**:条目内容被尖括号`〈〉`包裹,夹杂部分注音(如“徳紅切”)。
- **辅助创作价值**:与佩文韵府同理,但体量更小,适合快速查阅核心的传统押韵典故。
- **典型例子**`"東": { "道東": "〈漢鄭𤣥事馬融辭歸融曰吾道東矣本〉" }`
- **《五车韵瑞》**
- **内容与编排顺序**/
- **特点**/
- **整理问题****严重问题**,当前文件夹内的 `allorigins.json` 数据获取失败,内容实际上是 Nginx 的 `500 Internal Server Error` 报错网页代码并非JSON数据。
- **辅助创作价值**:暂时无价值。需要修复爬虫和数据源。
- **典型例子**`<html><head><title>500 Internal Server Error</title></head><body>...`

View File

@@ -1,161 +0,0 @@
import bs4
import os
import re
import json
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
"""Parse one EPUB xhtml volume into an ordered event stream.

Returns a list of ("section", title) / ("text", fragment) tuples taken from
the <p> children of every div.poem, preserving document order so callers can
re-segment the text by section headings.
"""
with open(filepath, "r", encoding="utf-8") as f:
soup = bs4.BeautifulSoup(f.read(), "html.parser")
poem_divs = soup.find_all("div", class_="poem")
events = []
for div in poem_divs:
for p in div.find_all("p"):
for child in p.children:
if child.name == "br":
pass # ignore br, we want continuous text
# NOTE(review): endswith("") is always True, so any id-bearing span is
# treated as a section heading — a character (perhaps 韻 or 部) appears
# to have been lost from this literal; confirm against the original.
elif child.name == "span" and child.get("id") and child.text.strip().endswith(""):
events.append(("section", child.text.strip()))
elif child.name == "small":
small_text = child.get_text()
# NOTE(review): both startswith("") checks are always True, so this
# branch is dead and the else always runs — the bracket literals
# (likely 〈 / 〉) were probably lost in extraction.
if not small_text.startswith("") and not small_text.startswith(""):
events.append(("text", f"{small_text}"))
else:
events.append(("text", small_text))
elif isinstance(child, str):
# Bare NavigableString between tags: keep it as running text.
events.append(("text", child))
else:
# Any other inline tag: keep only its visible text.
events.append(("text", child.get_text()))
return events
def extract_sections(content_text):
    """Split an entry body into its three parts.

    Returns a dict with keys 叙事 (narrative), 事对 (paired allusions) and
    诗文 (literary quotations). The 事对 part starts at the 事對/事对 marker;
    the 诗文 part starts at the first genre tag (賦, 詩, …) that directly
    follows a closing 〉 bracket, which is kept with the preceding part.
    """
    parts = {"叙事": "", "事对": "", "诗文": ""}
    remainder = content_text
    # Locate the 事對 marker, trying the traditional spelling first.
    marker_pos = remainder.find("事對")
    if marker_pos == -1:
        marker_pos = remainder.find("事对")
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"
    if marker_pos == -1:
        # No 事對 section: everything up to the first genre tag is narrative.
        genre_hit = re.search(genre_pattern, remainder)
        if genre_hit:
            cut = genre_hit.start() + 1  # keep the closing 〉 with 叙事
            parts["叙事"] = remainder[:cut]
            parts["诗文"] = remainder[cut:]
        else:
            parts["叙事"] = remainder
    else:
        parts["叙事"] = remainder[:marker_pos]
        tail = remainder[marker_pos + 2 :]  # skip the two-character marker
        genre_hit = re.search(genre_pattern, tail)
        if genre_hit:
            cut = genre_hit.start() + 1  # keep the closing 〉 with 事对
            parts["事对"] = tail[:cut]
            parts["诗文"] = tail[cut:]
        else:
            parts["事对"] = tail
    for key in parts:
        # NOTE(review): both replace() targets are empty strings in the
        # recovered source (characters, likely 〈 and 〉, lost in extraction);
        # the calls are no-ops and are kept verbatim for fidelity.
        parts[key] = parts[key].replace("", "").replace("", "").strip()
    return parts
def main():
    """Parse all 30 volumes and write the aggregated 初学记.json.

    Each volume's event stream is merged into (section, text) runs, every
    run is split into entries at 〈叙事〉/〈敘事〉 headers, and each entry's
    content is divided into 叙事/事对/诗文 by extract_sections().
    """
    categories = {}  # entry name -> list of per-volume occurrence dicts
    total_entries_found = 0
    for vol in range(1, 31):
        # Locate this volume's XHTML file by its juanNN filename suffix.
        filename = None
        for fn in os.listdir(html_dir):
            if fn.endswith(f"juan{vol:02d}.xhtml"):
                filename = fn
                break
        if not filename:
            print(f"Volume {vol} not found")
            continue
        events = parse_html(os.path.join(html_dir, filename))
        # Collapse the event stream into (section, concatenated text) runs:
        # each "section" event closes the previous run and opens a new one.
        merged = []
        current_section = ""
        current_text = []
        for ev_type, val in events:
            if ev_type == "section":
                if current_text:
                    merged.append((current_section, "".join(current_text)))
                    current_text = []
                current_section = val
            else:
                current_text.append(val)
        if current_text:
            merged.append((current_section, "".join(current_text)))
        for sec, txt in merged:
            # Entry headers look like "<name>第N〈叙事〉", optionally preceded
            # by a boundary character (〉, ), ideographic space, whitespace).
            matches = list(re.finditer(r"(?:^|[〉)\u3000\s])([^〈〉<>\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", txt))
            if not matches:
                continue
            for i, m in enumerate(matches):
                entry_name = m.group(1).strip()
                start_idx = m.end()  # content begins right after 〈叙事〉
                # Slice up to the NEXT header's group(1) (the entry name),
                # not match.start(): the optional boundary char before the
                # next name — typically the 〉 closing the current entry's
                # final quotation — belongs to the CURRENT entry's text and
                # must stay inside this slice.
                end_idx = matches[i+1].start(1) if i + 1 < len(matches) else len(txt)
                content_text = txt[start_idx:end_idx]
                sections = extract_sections(content_text)
                if entry_name not in categories:
                    categories[entry_name] = []
                categories[entry_name].append({
                    "volume": vol,
                    "section": sec,
                    "content": sections
                })
                total_entries_found += 1
    final_json = {
        "metadata": {
            "title": "初学记",
            "author": "徐坚",
            "dynasty": "",
            "total_volumes": 30,
            "source": "2026年1月28日从维基文库导出"
        },
        "preface": "",
        "categories": categories
    }
    with open("初学记.json", "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)
    print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.")


if __name__ == "__main__":
    main()

View File

@@ -1,24 +0,0 @@
import bs4
import re
import sys
def parse_html(filepath):
    """Debug helper: print every section heading found in one XHTML volume.

    Walks the direct children of each <p> inside a div.poem and reports
    <span> elements that carry an id and whose stripped text ends with the
    expected suffix. Output goes to stdout; nothing is returned.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    current_section = None
    poem_divs = soup.find_all("div", class_="poem")
    for div in poem_divs:
        # Before we process children, let's look at p tags
        for p in div.find_all("p"):
            for child in p.children:
                if child.name == "span" and child.get("id") and child.text.strip().endswith(""):
                    current_section = child.text.strip()
                    print("Found Section:", current_section)
                elif child.name == "br":
                    pass
                elif isinstance(child, bs4.element.NavigableString):
                    # Idiom fix: isinstance() instead of type() == — this also
                    # matches NavigableString subclasses (e.g. Comment); both
                    # branches are no-ops, so behavior is unchanged.
                    pass
parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")

View File

@@ -1,54 +0,0 @@
import bs4
import re
import os
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Parse one XHTML volume into ("section" | "line", text) events.

    <br> tags terminate the current line; section headings are <span>
    elements with an id whose stripped text ends with the expected suffix;
    <small> commentary is bracket-wrapped before joining the running line.
    """
    with open(filepath, "r", encoding="utf-8") as fh:
        soup = bs4.BeautifulSoup(fh.read(), "html.parser")
    events = []
    for poem in soup.find_all("div", class_="poem"):
        for para in poem.find_all("p"):
            buffer = []

            def flush():
                # Emit the accumulated line, if any, and reset the buffer.
                if buffer:
                    events.append(("line", "".join(buffer).strip()))
                    buffer.clear()

            for node in para.children:
                name = node.name
                if name == "br":
                    flush()
                elif name == "span" and node.get("id") and node.text.strip().endswith(""):
                    events.append(("section", node.text.strip()))
                elif name == "small":
                    small_text = node.get_text()
                    # Wrap commentary unless it already carries its own markers.
                    if small_text.startswith("") or small_text.startswith(""):
                        buffer.append(small_text)
                    else:
                        buffer.append(f"{small_text}")
                else:
                    buffer.append(node.get_text())
            flush()
    return events
# Smoke test: parse juan 01 and dump the first 30 events for inspection.
events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for e in events[:30]:
    print(e)

View File

@@ -1,127 +0,0 @@
import bs4
import os
import re
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Parse one XHTML volume into ("section" | "line", text) events.

    Lines are accumulated across a <p>'s children and flushed on <br>;
    section headings are id-bearing <span> elements; <small> commentary is
    bracket-wrapped before joining the running line.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    poem_divs = soup.find_all("div", class_="poem")
    events = []
    for div in poem_divs:
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # <br> closes the current line (only if non-empty).
                    if current_line:
                        events.append(("line", "".join(current_line).strip()))
                        current_line = []
                elif (
                    child.name == "span"
                    and child.get("id")
                    and child.text.strip().endswith("")
                ):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    # Wrap commentary unless it already carries its own markers.
                    if not small_text.startswith("") and not small_text.startswith(
                        ""
                    ):
                        current_line.append(f"{small_text}")
                    else:
                        current_line.append(small_text)
                else:
                    current_line.append(child.get_text())
            # Flush whatever remains after the last <br>.
            if current_line:
                events.append(("line", "".join(current_line).strip()))
    return events
def extract_sections(entry_text):
    """Split one entry's text into its 叙事 / 事对 / 诗文 parts.

    The entry must contain a 〈叙事〉/〈敘事〉 marker; everything after it is
    examined. The 事对 part begins at the first "事對"/"事对" token; the
    诗文 part begins at the first "〉<genre>" boundary (the closing 〉 stays
    with the part before it). Returns a dict keyed "叙事"/"事对"/"诗文";
    absent parts come back as "".
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    narrative_match = re.search(r"〈(叙事|敘事)〉(.*)", entry_text)
    if not narrative_match:
        return result
    rest_after_narrative = narrative_match.group(2)
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")
    # Hoisted: this pattern was previously duplicated in both branches below.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"
    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        rest_after_shidui = rest_after_narrative[shidui_start + 2 :]  # skip 事對 token
        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep the closing 〉 with 事对
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        shiwen_match = re.search(genre_pattern, rest_after_narrative)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = rest_after_narrative[:split_idx]
            result["诗文"] = rest_after_narrative[split_idx:]
        else:
            result["叙事"] = rest_after_narrative
    for k in result:
        # Bug fix: this cleanup previously called replace("", "") (a no-op)
        # and never removed the 〈〉 annotation markers it was meant to strip.
        result[k] = result[k].replace("〈", "").replace("〉", "")
    return result
# Driver: split juan 01's line events into entries at 〈叙事〉/〈敘事〉 markers
# and print a preview of the first three.
events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
entries = []
current_entry_text = []  # lines belonging to the entry currently being built
for ev_type, text in events:
    if ev_type == "section":
        # Section headings are not part of any entry's text.
        pass
    else:
        if "〈叙事〉" in text or "〈敘事〉" in text:
            # A new entry starts here: finalize the one accumulated so far.
            if current_entry_text:
                full_text = "".join(current_entry_text)
                # Header shape: "<name>第N〈叙事〉" (ordinal part optional).
                match = re.search(
                    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉",
                    full_text,
                )
                if match:
                    entry_name = match.group(1).strip()
                    sections = extract_sections(full_text)
                    entries.append((entry_name, sections))
            current_entry_text = [text]
        else:
            current_entry_text.append(text)
# Flush the final entry left in the accumulator.
if current_entry_text:
    full_text = "".join(current_entry_text)
    match = re.search(
        r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", full_text
    )
    if match:
        entry_name = match.group(1).strip()
        sections = extract_sections(full_text)
        entries.append((entry_name, sections))
for entry_name, sections in entries[:3]:
    print("Entry:", entry_name)
    print("叙事:", sections["叙事"][:50])
    print("事对:", sections["事对"][:50])
    print("诗文:", sections["诗文"][:50])
    print("-" * 20)

View File

@@ -1,35 +0,0 @@
import bs4
import os
import json
import re
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Flatten one XHTML volume into a list of text lines, split on <br>."""
    with open(filepath, "r", encoding="utf-8") as fh:
        soup = bs4.BeautifulSoup(fh.read(), "html.parser")
    lines = []
    for poem in soup.find_all("div", class_="poem"):
        for para in poem.find_all("p"):
            pieces = []
            for node in para.children:
                if node.name != "br":
                    pieces.append(node.get_text())
                    continue
                # NB: a <br> always flushes, even when the line is empty.
                lines.append("".join(pieces).strip())
                pieces = []
            if pieces:
                lines.append("".join(pieces).strip())
    return lines
# Smoke test: parse juan 01 and print the first 50 extracted lines.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for i, t in enumerate(texts[:50]):
    print(f"{i}: {t}")

View File

@@ -1,93 +0,0 @@
import bs4
import os
import json
import re
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Flatten one XHTML volume into a list of text lines, split on <br>.

    Collects the text of every child of each <p> inside a div.poem; a <br>
    terminates the current line (even an empty one gets appended).
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    poem_divs = soup.find_all("div", class_="poem")
    texts = []
    for div in poem_divs:
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # Flush unconditionally — an empty line is still recorded.
                    texts.append("".join(current_line).strip())
                    current_line = []
                else:
                    current_line.append(child.get_text())
            # Flush whatever remains after the last <br>.
            if current_line:
                texts.append("".join(current_line).strip())
    return texts
def extract_sections(entry_text):
    """Split one entry's text into its 叙事 / 事对 / 诗文 parts.

    The entry must contain a 〈叙事〉 marker; everything after it is
    examined. The 事对 part begins at the first "事對"/"事对" token; the
    诗文 part begins at the first "〉<genre>" boundary (the closing 〉 stays
    with the part before it). Returns a dict keyed "叙事"/"事对"/"诗文";
    absent parts come back as "". No marker cleanup is performed here.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    # Extract 叙事
    narrative_match = re.search(r"〈叙事〉(.*)", entry_text)
    if not narrative_match:
        return result
    rest_after_narrative = narrative_match.group(1)
    # Find 事对
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")
    # Hoisted: this pattern was previously duplicated in both branches below.
    # It matches 〉 followed by a literary genre, marking where 诗文 starts.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)"
    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        rest_after_shidui = rest_after_narrative[shidui_start + 2 :]  # skip "事對"
        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep the genre character
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        # No 事对
        shiwen_match = re.search(genre_pattern, rest_after_narrative)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = rest_after_narrative[:split_idx]
            result["诗文"] = rest_after_narrative[split_idx:]
        else:
            result["叙事"] = rest_after_narrative
    return result
# Driver: scan juan 01's flattened lines for entry headers and preview the
# section lengths of the first three parsed entries.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
entries = []
for line in texts:
    if "〈叙事〉" in line:
        entry_name = line.split("〈叙事〉")[0]
        # remove "第X" from entry_name
        word_name = re.sub(
            r"第[一二三四五六七八九十百]+(?:[上下])?$", "", entry_name
        ).strip()
        sections = extract_sections(line)
        entries.append((word_name, entry_name, sections))
for e in entries[:3]:
    print(f"Word: {e[0]}")
    print(f"Entry: {e[1]}")
    print(f"叙事 len: {len(e[2]['叙事'])}")
    print(f"事对 len: {len(e[2]['事对'])}")
    print(f"诗文 len: {len(e[2]['诗文'])}")
    print("-" * 20)

View File

@@ -1,41 +0,0 @@
import re
def extract_sections(content_text):
    """Split one entry's body into its 叙事 / 事对 / 诗文 parts.

    *content_text* is assumed to start right after the 〈叙事〉 marker.
    The 事对 part begins at the first "事對"/"事对" token; the 诗文 part
    begins at the first "〉<genre>" boundary (the closing 〉 stays with the
    part before it). Returns a dict keyed "叙事"/"事对"/"诗文"; absent
    parts come back as "".
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    rest_after_narrative = content_text
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")
    # Fix: dropped the duplicated 銘 and 論 alternatives that appeared twice
    # in this alternation (duplicates can never change what matches).
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"
    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        rest_after_shidui = rest_after_narrative[shidui_start + 2 :]  # skip 事對 token
        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep the closing 〉 with 事对
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        shiwen_match = re.search(genre_pattern, rest_after_narrative)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = rest_after_narrative[:split_idx]
            result["诗文"] = rest_after_narrative[split_idx:]
        else:
            result["叙事"] = rest_after_narrative
    for k in result:
        # Bug fix: this cleanup previously called replace("", "") (a no-op)
        # and never removed the 〈〉 annotation markers it was meant to strip.
        result[k] = result[k].replace("〈", "").replace("〉", "")
    return result
import json

# Manual spot-check of the splitter on a synthetic entry string.
print(extract_sections("這里是叙事事對這裡是事對〉詩這里是詩文"))