Update: 删除垃圾程序

This commit is contained in:
denglifan
2026-03-22 16:43:10 +08:00
parent 183b842090
commit 2881163106
41 changed files with 93 additions and 2046 deletions

View File

@@ -1,8 +0,0 @@
import json

# Quick inspection script: print the first 20 headword entries ("词条")
# recorded under the rhyme character "𩅰" in peiwenyunfu.json.
# NOTE(review): source indentation was lost in transit; restored here.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

siai_citiao = data.get("𩅰", {}).get("词条", {})
for k, v in list(siai_citiao.items())[:20]:
    # Show only the first 30 characters of each entry body.
    print(f"{k}: {v[:30]}...")

View File

@@ -1,7 +0,0 @@
import json

# Dump the full record for a handful of rhyme keys from the v2 JSON.
# NOTE(review): indentation was lost in transit and restored here; three
# keys in the list below were also lost (they appear as empty strings) —
# restore them before running, otherwise `data.get('')` just prints {}.
with open('peiwenyunfu_v2.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for k in ['𩅰', '', '', '']:
    print(f"Rhyme: {k}")
    print(json.dumps(data.get(k, {}), ensure_ascii=False, indent=2))

View File

@@ -1,71 +0,0 @@
import os
import time
import urllib.request
import urllib.parse
from urllib.error import HTTPError
def main():
    """Download every Wikisource page listed in still_missing.txt into
    html_files/, retrying up to 5 times per URL and backing off on 403.

    All traffic is routed through the local HTTP proxy at 127.0.0.1:10808.
    NOTE(review): source indentation was lost in transit; restored here.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("still_missing.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # Install the proxy opener globally so plain urlopen() uses it.
    proxy_support = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:10808', 'https': 'http://127.0.0.1:10808'})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    # Browser-like headers: Wikisource rejects bare-bones clients with 403.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Referer': 'https://zh.wikisource.org/'
    }
    count = 0
    total = len(urls)
    for url in urls:
        # The last path segment (percent-decoded) is the volume title.
        vol = urllib.parse.unquote(url).split('/')[-1]
        filename = f'html_files/{vol}.html'
        if os.path.exists(filename):
            print(f"[{count+1}/{total}] Skipping {vol} (exists)")
            count += 1
            continue
        print(f"[{count+1}/{total}] Downloading {vol}...")
        success = False
        for attempt in range(5):
            try:
                req = urllib.request.Request(url, headers=headers)
                # Close the response explicitly (the original leaked the
                # handle by never closing it).
                with urllib.request.urlopen(req, timeout=15) as response:
                    html = response.read().decode('utf-8')
                with open(filename, 'w', encoding='utf-8') as out_f:
                    out_f.write(html)
                print(f" -> Success!")
                success = True
                break
            except HTTPError as e:
                if e.code == 403:
                    # Rate limited: wait longer before the next attempt.
                    print(f" -> 403 Forbidden. Waiting 5 seconds...")
                    time.sleep(5)
                else:
                    print(f" -> HTTP Error {e.code}")
            except Exception as e:
                print(f" -> Error: {e}")
            time.sleep(2)
        if not success:
            print(f" -> Failed all attempts.")
        count += 1
        time.sleep(1)  # Be nice to server


if __name__ == "__main__":
    main()

View File

@@ -1,110 +0,0 @@
import urllib.request
from bs4 import BeautifulSoup
import urllib.parse
import re
import os
url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"
def chinese_to_arabic(cn_str):
    """Convert a Chinese numeral string (e.g. "一百零六") to an int.

    Handles 零/〇, the digits 一..九, and the units 十/百/千 — enough for
    the volume numbers (1-106) used by this project.

    NOTE(review): the numeral keys in the table below were garbled (empty
    strings) in the copy under review; restored to the standard characters
    implied by the mapped values — confirm against the original script.
    """
    cn_num = {
        "〇": 0, "零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10,
        "百": 100, "千": 1000,
    }
    result = 0
    temp = 0
    for char in cn_str:
        if char in ["百", "千"]:
            # A bare unit (e.g. "百") counts as one of that unit.
            if temp == 0:
                temp = 1
            result += temp * cn_num[char]
            temp = 0
        elif char == "十":
            if temp == 0:
                temp = 1
            if len(cn_str) == 1:
                return 10
            elif result == 0 and temp == 1 and cn_str[0] == "十":
                # Leading "十" as in "十五" -> 15.
                result += 10
                temp = 0
            else:
                result += temp * cn_num[char]
                temp = 0
        else:
            # Plain digit: remember it until the next unit character.
            temp = cn_num.get(char, 0)
    result += temp
    return result
def get_filename(vol_str):
    """Map a volume title like "卷七十之4" to a local HTML filename.

    Both the major and sub-volume parts are run through chinese_to_arabic;
    the major number is zero-padded to three digits. Titles that do not
    start with "卷" fall through unchanged (plus the .html suffix).

    NOTE(review): source indentation was lost in transit; restored here.
    """
    m = re.match(r"卷(.+?)之(.+)", vol_str)
    if m:
        v1 = chinese_to_arabic(m.group(1))
        v2 = chinese_to_arabic(m.group(2))
        return f"{v1:03d}{v2}.html"
    m = re.match(r"卷(.+)", vol_str)
    if m:
        v1 = chinese_to_arabic(m.group(1))
        return f"{v1:03d}.html"
    return vol_str + ".html"
def main():
    """Scrape the Wikisource index page and write the URLs of per-volume
    pages that have no corresponding local file to missing_urls.txt.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        print("Failed to fetch:", e)
        # If the fetch failed, `html` stays unbound and the NameError
        # handler below aborts cleanly instead of crashing.
        pass
    try:
        soup = BeautifulSoup(html, "html.parser")
    except NameError:
        return
    html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/"
    existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set()
    missing_urls = []
    seen_urls = set()
    base_url = "https://zh.wikisource.org"
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue
        unquoted_href = urllib.parse.unquote(href)
        # Only keep per-volume links (卷XXX之Y); skip 全览 aggregate pages.
        if "御定佩文韻府" in unquoted_href and "四庫全書本" in unquoted_href and "/卷" in unquoted_href:
            title_part = unquoted_href.split("/")[-1]
            if "全覽" in title_part or "全览" in title_part:
                continue
            # Require the "卷X之Y" shape explicitly.
            if not re.match(r"卷.+?之.+", title_part):
                continue
            full_url = urllib.parse.urljoin(base_url, href)
            full_url = full_url.split('#')[0]  # drop fragment anchors
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            filename = get_filename(title_part)
            if filename not in existing_files:
                missing_urls.append(full_url)
    with open("missing_urls.txt", "w", encoding="utf-8") as f:
        for u in missing_urls:
            f.write(u + "\n")
    print(f"Found {len(missing_urls)} missing URLs.")


if __name__ == "__main__":
    main()

View File

@@ -1,14 +0,0 @@
import json

# One-off cleanup: in every rhyme's 词条 (headword) entries, expand the
# placeholder character to the rhyme character itself.
# NOTE(review): source indentation was lost in transit and restored here.
# The placeholder literal was also lost (empty string); restored to "丨"
# based on the companion script that substitutes runs of 丨 — confirm
# before running (replace("", x) would corrupt every string).
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    if "词条" in r_data:
        for word, content in r_data["词条"].items():
            if "丨" in content:
                r_data["词条"][word] = content.replace("丨", rhyme)

with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

View File

@@ -1,69 +0,0 @@
import json
import re

# Expand runs of the placeholder 丨 in entry bodies to the characters of
# the entry's own headword, keeping character alignment across runs.
# NOTE(review): source indentation was lost in transit; restored here.
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Section prefixes to strip from a headword before substitution.
# NOTE(review): two entries were lost (empty strings) in the copy under
# review; as empty strings they are harmless no-ops, but should be
# restored from the original script.
prefixes = ["韻藻", "", ""]


def clean_headword(word):
    """Strip known section prefixes (e.g. 韻藻) from a headword."""
    clean_word = word
    for _ in range(2):  # two passes for stacked prefixes like "増韻藻"
        for p in prefixes:
            if clean_word.startswith(p) and len(clean_word) > len(p):
                clean_word = clean_word[len(p):]
    return clean_word


def replace_pipes_in_content(content, word):
    """Replace runs of 丨 in `content` with characters of the cleaned
    headword, carrying alignment over between partial runs."""
    clean_word = clean_headword(word)
    word_len = len(clean_word)
    if word_len == 0 or "丨" not in content:
        return content

    def repl(match):
        nonlocal pipe_idx
        block = match.group(0)
        block_len = len(block)
        if block_len % word_len == 0:
            # Whole-word run: reset alignment and repeat the word.
            pipe_idx = 0
            return clean_word * (block_len // word_len)
        else:
            # Partial run: continue from the current offset in the word.
            res = ""
            for _ in range(block_len):
                res += clean_word[pipe_idx % word_len]
                pipe_idx += 1
            return res

    pipe_idx = 0
    return re.sub(r'丨+', repl, content)


print("Processing...")
for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    # 1. Fix 小韵描述: here the placeholder stands for the rhyme character
    #    itself (the dictionary key), so a plain replace suffices.
    if "小韵描述" in r_data and r_data["小韵描述"]:
        # NOTE(review): the placeholder literal here was lost (empty
        # string); restored to "丨" to match the rest of this script —
        # confirm before running.
        r_data["小韵描述"] = r_data["小韵描述"].replace("丨", rhyme)
    # 2. Fix 词条 entry bodies via the alignment-aware substitution.
    if "词条" in r_data:
        new_citiao = {}
        for word, content in r_data["词条"].items():
            new_citiao[word] = replace_pipes_in_content(content, word)
        r_data["词条"] = new_citiao

print("Saving peiwenyunfu.json...")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
print("Done!")

View File

@@ -1,24 +0,0 @@
import json

# Expand the placeholder to the rhyme character in the 对语 / 摘句 sections
# and in the 词条 headword keys themselves.
# NOTE(review): source indentation was lost in transit and restored here.
# The placeholder literal was also lost (empty string); restored to "丨"
# based on the companion pipe-substitution script — confirm before running.
print("Loading...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    if "对语" in r_data and r_data["对语"]:
        r_data["对语"] = r_data["对语"].replace("丨", rhyme)
    if "摘句" in r_data and r_data["摘句"]:
        r_data["摘句"] = r_data["摘句"].replace("丨", rhyme)
    if "词条" in r_data:
        # Rebuild the dict because the keys themselves may change.
        new_citiao = {}
        for word, content in r_data["词条"].items():
            new_word = word.replace("丨", rhyme)
            new_citiao[new_word] = content
        r_data["词条"] = new_citiao

with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
print("Done!")

View File

@@ -1,57 +0,0 @@
import json

# Restructure peiwenyunfu.json: split each top-level rhyme's flat 词条 dict
# into per-character records (keyed by the small-rhyme head character) with
# a nested 韵藻 dict, writing the result to peiwenyunfu_v2.json.
# NOTE(review): source indentation was lost in transit; restored here.
# Several CJK literals were also lost (empty strings).  The record keys are
# restored to 卷/声/韵 to match the parser that produced the data; the two
# garbled prefix entries and the garbled entries in the dictionary-source
# list below could NOT be recovered — the empty strings in that list make
# the `any(...)` test always true for 1-character keys, so restore them
# before running.
print("Loading...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

new_data = {}
new_data['metadata'] = data['metadata']
new_data['preface'] = data['preface']

prefixes = ["韻藻", "韵藻", "", ""]


def clean_key(k):
    """Repeatedly strip known section prefixes from a headword key."""
    changed = True
    while changed:
        changed = False
        for p in prefixes:
            if k.startswith(p) and len(k) > len(p):
                k = k[len(p):]
                changed = True
    return k


for rhyme, r_data in data.items():
    if rhyme in ['metadata', 'preface']:
        continue
    # Seed a record for the main rhyme character itself.
    new_data[rhyme] = {
        "卷": r_data["卷"],
        "声": r_data["声"],
        "韵": r_data["韵"],
        "小韵描述": r_data["小韵描述"],
        "韵藻": {},
        "对语": r_data.get("对语", ""),
        "摘句": r_data.get("摘句", "")
    }
    current_rhyme = rhyme
    for k, v in r_data.get("词条", {}).items():
        # A single-character key whose description opens with a dictionary
        # citation (説文/廣韻/...) starts a new small-rhyme character.
        if len(k) == 1 and any(x in v[:15] for x in ['', '説文', '廣韻', '玉篇', '集韻', '韻㑹', '', '', '釋名', '爾雅']):
            current_rhyme = k
            new_data[current_rhyme] = {
                "卷": r_data["卷"],
                "声": r_data["声"],
                "韵": r_data["韵"],
                "小韵描述": k + v,
                "韵藻": {},
                "对语": "",
                "摘句": ""
            }
        else:
            cleaned = clean_key(k)
            new_data[current_rhyme]["韵藻"][cleaned] = v

with open('peiwenyunfu_v2.json', 'w', encoding='utf-8') as f:
    json.dump(new_data, f, ensure_ascii=False, indent=4)
print(f"Old size: {len(data)}, New size: {len(new_data)}")

View File

@@ -1,28 +0,0 @@
import json

# Forward-fill missing 卷 (volume) fields: rhymes parsed from continuation
# pages carry no volume, so propagate the last non-empty value forward in
# dictionary order.
# NOTE(review): source indentation was lost in transit and restored here.
# The volume key literal was also lost (empty string); restored to "卷"
# based on the `vol` variable name and the parser's output keys — confirm
# against the data before running.
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

current_vol = ""
fixed_count = 0
for rhyme, r_data in data.items():
    if rhyme in ['metadata', 'preface']:
        continue
    vol = r_data.get("卷", "")
    if vol.strip():
        # Remember the last valid volume seen.
        current_vol = vol.strip()
    else:
        # Empty volume: inherit the current one.
        if current_vol:
            r_data["卷"] = current_vol
            fixed_count += 1

print(f"Fixed {fixed_count} missing volumes.")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
print("Saved to peiwenyunfu.json!")

View File

@@ -1,83 +0,0 @@
import os
import glob
import json
import re
from parser import parse_html
def natural_sort_key(s):
    """Sort key for volume filenames like "卷001之1.html".

    Returns (1, volume, sub_volume) for matching names, and (2, 0, 0) as a
    fallback so unrecognized names sort after all volumes.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    basename = os.path.basename(s)
    # The downloaded filenames already use Arabic digits, e.g.
    # "卷001之1.html" or "卷106之4.html"; "之N" is optional.
    m = re.search(r"卷(\d+)(?:之(\d+))?", basename)
    if m:
        vol = int(m.group(1))
        sub = int(m.group(2)) if m.group(2) else 0
        return (1, vol, sub)
    # fallback
    return (2, 0, 0)
def main():
    """Parse every downloaded volume page and merge the per-rhyme results
    into peiwenyunfu.json under a "content" key, with a metadata header.

    NOTE(review): source indentation was lost in transit; restored here.
    The 卷/声/韵 key literals were also lost (empty strings); restored to
    match the parser's output keys — confirm before running.
    """
    # Only pick files that start with "卷" to skip 全覽 aggregate pages.
    files = glob.glob('html_files/卷*.html')
    files.sort(key=natural_sort_key)
    print(f"Starting to parse {len(files)} files...")
    combined_result = {}
    success_count = 0
    fail_count = 0
    for idx, fpath in enumerate(files):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Drop the "(韵母)" marker the parser prefixes to keys.
                clean_key = k.replace("(韵母)", "")
                if clean_key not in combined_result:
                    combined_result[clean_key] = v
                else:
                    # Same rhyme spans several files: merge the sections.
                    combined_result[clean_key]["词条"].update(v.get("词条", {}))
                    if "对语" in v and v["对语"]:
                        combined_result[clean_key]["对语"] += v["对语"]
                    if "摘句" in v and v["摘句"]:
                        combined_result[clean_key]["摘句"] += v["摘句"]
                    # Backfill 卷/声/韵 if the first file lacked them.
                    if not combined_result[clean_key]["卷"] and v["卷"]:
                        combined_result[clean_key]["卷"] = v["卷"]
                    if not combined_result[clean_key]["声"] and v["声"]:
                        combined_result[clean_key]["声"] = v["声"]
                    if not combined_result[clean_key]["韵"] and v["韵"]:
                        combined_result[clean_key]["韵"] = v["韵"]
            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")
    # Construct final output with metadata.
    # NOTE(review): the "dynasty" value was lost (empty string) in the copy
    # under review; left as-is.
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等",
            "dynasty": "",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": "",
        "content": combined_result
    }
    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()

View File

@@ -1,79 +0,0 @@
import os
import glob
import json
import re
from parser import parse_html
def natural_sort_key(s):
    """Ordering key for filenames like "卷001之1.html": (1, volume, sub)
    for matching names, (2, 0, 0) so non-matching names sort last.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    match = re.search(r"卷(\d+)(?:之(\d+))?", os.path.basename(s))
    if not match:
        return (2, 0, 0)
    volume = int(match.group(1))
    part = match.group(2)
    return (1, volume, int(part) if part else 0)
def main():
    """Parse every downloaded volume page and merge the per-rhyme results
    into peiwenyunfu.json, with the rhyme characters at the top level
    (alongside "metadata" and "preface").

    NOTE(review): source indentation was lost in transit; restored here.
    The 卷/声/韵 key literals were also lost (empty strings); restored to
    match the parser's output keys — confirm before running.
    """
    files = glob.glob('html_files/卷*.html')
    files.sort(key=natural_sort_key)
    print(f"Starting to parse {len(files)} files...")
    combined_result = {}
    success_count = 0
    fail_count = 0
    for idx, fpath in enumerate(files):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Drop the "(韵母)" marker the parser prefixes to keys.
                clean_key = k.replace("(韵母)", "")
                if clean_key not in combined_result:
                    combined_result[clean_key] = v
                else:
                    # Same rhyme spans several files: merge the sections.
                    combined_result[clean_key]["词条"].update(v.get("词条", {}))
                    if "对语" in v and v["对语"]:
                        combined_result[clean_key]["对语"] += v["对语"]
                    if "摘句" in v and v["摘句"]:
                        combined_result[clean_key]["摘句"] += v["摘句"]
                    # Backfill 卷/声/韵 if the first file lacked them.
                    if not combined_result[clean_key]["卷"] and v["卷"]:
                        combined_result[clean_key]["卷"] = v["卷"]
                    if not combined_result[clean_key]["声"] and v["声"]:
                        combined_result[clean_key]["声"] = v["声"]
                    if not combined_result[clean_key]["韵"] and v["韵"]:
                        combined_result[clean_key]["韵"] = v["韵"]
            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")
    # NOTE(review): the "dynasty" value was lost (empty string); left as-is.
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等奉敕编",
            "dynasty": "",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": ""
    }
    final_output.update(combined_result)
    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()

View File

@@ -1,9 +0,0 @@
def simplify_char(c):
    """Map a traditional character to its simplified form; identity for
    anything not in the table.

    NOTE(review): the mapping pairs were lost (empty strings) in the copy
    under review, so this is currently an identity function — restore the
    character pairs before relying on it.
    """
    mapping = {'': '', '': '', '': ''}  # add others if needed
    return mapping.get(c, c)


# Arabic-digit equivalents for Chinese numerals.
# NOTE(review): the numeral keys were lost (empty strings) in transit;
# restore 一..九 before use.
num_map = {'': '1', '': '2', '': '3', '': '4', '': '5', '': '6', '': '7', '': '8', '': '9'}
# Intended usage for a volume string like "01之一":
# vol_m = re.search(r"卷(.)之(.)", text)
# "0" + num_map[vol_m.group(1)] + "之" + vol_m.group(2)

View File

@@ -1,124 +0,0 @@
import re
from bs4 import BeautifulSoup
def simplify(text):
    """Convert traditional characters in `text` to simplified ones via a
    small substitution table.

    NOTE(review): source indentation was lost in transit and restored here.
    All character pairs in the table were also lost (empty strings), which
    makes this function a no-op at present — restore the pairs before
    relying on its output.
    """
    mapping = {
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
    }
    for k, v in mapping.items():
        text = text.replace(k, v)
    return text
def parse_html(file_path):
    """Parse one downloaded Wikisource volume page.

    Returns a dict keyed by "(韵母)<char>" with volume/tone/rhyme info, the
    small-rhyme description, and the 词条 / 对语 / 摘句 sections.

    NOTE(review): source indentation was lost in transit and restored here.
    Several CJK literals were also lost (empty strings).  The ones whose
    value is unambiguous from context (the numeral table, the 〈〉 bracket
    strip, the 卷/声/韵 result keys) have been restored; the remaining
    empty-string literals are flagged inline and must be re-checked against
    the original script before running.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}
    text = poem_div.get_text()
    # Extract tone and rhyme from the header line, e.g. "上平聲 一東韻".
    tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
    current_tone = simplify(tone_rhyme_m.group(1)) if tone_rhyme_m else ""
    if tone_rhyme_m:
        raw_rhyme = tone_rhyme_m.group(2).strip()
        m_rhyme = re.match(r"(.*?)[韻]", raw_rhyme)
        # NOTE(review): the literal stripped in the fallback was lost
        # (empty string) — presumably "韻"; confirm before running.
        current_rhyme = simplify(m_rhyme.group(1)) if m_rhyme else simplify(raw_rhyme.replace('', ''))
    else:
        current_rhyme = ""
    # Extract the volume number, e.g. "卷十之一".
    vol_m = re.search(r"卷([一二三四五六七八九十]+)之(.+)", text)
    if vol_m:
        # NOTE(review): numeral keys restored from the mapped values and
        # the regex character class above.
        num_map = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
        v1 = vol_m.group(1)
        v1_digit = "".join(num_map.get(c, c) for c in v1)
        if len(v1_digit) == 1:
            v1_str = f"0{v1_digit}"
        else:
            v1_str = v1_digit
        # The match after '之' may capture trailing text, e.g. '一\n'.
        v2 = vol_m.group(2).split('\n')[0].strip()
        # NOTE(review): possibly a "之" literal was lost from this f-string;
        # left as-is — confirm against the original.
        current_vol = f"{v1_str}{v2}"
    else:
        current_vol = ""
    # Collect the rhyme head characters listed after the header line.
    lines = text.split('\n')
    rhyme_chars = []
    for i, line in enumerate(lines):
        if tone_rhyme_m and tone_rhyme_m.group(2) in line:
            chars_line = lines[i+1]
            rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1]
            break
    # Drop boilerplate lines before joining the text back together.
    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped == "欽定四庫全書":
            continue
        if stripped.startswith("御定佩文韻府"):
            continue
        if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped:
            # NOTE(review): the literal tested below was lost (empty
            # string), which makes this test always true — restore the
            # intended character before running.
            if "" in stripped:
                continue
        if all(c in rhyme_chars or c in '  ' for c in stripped):
            continue
        clean_lines.append(stripped)
    clean_text = "".join(clean_lines)
    # Each entry is "<headword>〈description〉..." in the cleaned text.
    tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
    result = {}
    current_char = None
    for word, desc_blocks in tokens:
        word = word.strip()
        # NOTE(review): restored '〈'/'〉' as the characters stripped from
        # the description blocks (matches the token regex above).
        desc_content = desc_blocks.replace('〈', '').replace('〉', '')
        # Is this a main (small-rhyme head) character definition?
        if word in rhyme_chars:
            current_char = word
            simplified_char = simplify(current_char)
            key = f"(韵母){simplified_char}"
            if key not in result:
                result[key] = {
                    "卷": current_vol,
                    "声": current_tone,
                    "韵": current_rhyme,
                    "小韵描述": simplify(current_char + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": ""
                }
        elif word == "對語" or word == "对语":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                # Could be multiple parts, usually one per character block.
                result[key]["对语"] += desc_content
        elif word == "摘句":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                result[key]["摘句"] += desc_content
        else:
            # Ordinary 词条 entry under the current head character.
            if current_char and word:
                key = f"(韵母){simplify(current_char)}"
                result[key]["词条"][word] = desc_content
    return result

View File

@@ -1,65 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
def main():
    """Download each URL in missing_urls.txt to html_files/ through the
    local HTTP proxy, with up to 3 retries per file.

    NOTE(review): source indentation was lost in transit; restored here.
    The progress messages read "(unknown)" in the copy under review — the
    {volume_name} placeholders have been restored as the likely original.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    for url in urls:
        # Decode the URL to get the original characters.
        decoded_url = urllib.parse.unquote(url)
        # Extract the volume name, e.g., 卷001之1.
        volume_name = decoded_url.split("/")[-1]
        filename = f"html_files/{volume_name}.html"
        if os.path.exists(filename):
            print(f"Skipping {volume_name}, already exists.")
            continue
        print(f"Downloading {volume_name} from {url}...")
        success = False
        retries = 3
        while not success and retries > 0:
            try:
                response = requests.get(
                    url, headers=headers, proxies=proxies, timeout=15
                )
                if response.status_code == 200:
                    with open(filename, "w", encoding="utf-8") as out_f:
                        out_f.write(response.text)
                    success = True
                    print(f"Successfully downloaded {volume_name}")
                else:
                    print(
                        f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}"
                    )
            except RequestException as e:
                print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")
            if not success:
                retries -= 1
                time.sleep(2)  # Delay before retry
        if not success:
            print(f"Failed to download {url} after all retries.")
        time.sleep(0.5)  # Small delay between requests


if __name__ == "__main__":
    main()

View File

@@ -1,65 +0,0 @@
import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector, ProxyType
import os
import urllib.parse
import time
async def download_file(session, url, filename, semaphore):
    """Download one URL to `filename`, gated by `semaphore`, with up to
    3 retries.  Returns True on success or when the file already exists.

    NOTE(review): source indentation was lost in transit; restored here.
    Progress messages read "(unknown)" in the copy under review — restored
    to {filename} as the likely original placeholder.
    """
    async with semaphore:
        if os.path.exists(filename):
            print(f"Skipping {filename}")
            return True
        retries = 3
        while retries > 0:
            try:
                async with session.get(url, timeout=30) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(filename, "wb") as f:
                            f.write(content)
                        print(f"Successfully downloaded {filename}")
                        return True
                    else:
                        print(f"HTTP Error {response.status} for {url}")
            except Exception as e:
                print(f"Error for {url}: {e}")
            retries -= 1
            await asyncio.sleep(2)  # back off before the next attempt
        print(f"Failed all retries for {url}")
        return False
async def main():
    """Queue every URL in missing_urls.txt for download through the local
    SOCKS5 proxy, at most 10 concurrently.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # SOCKS5 proxy at 127.0.0.1:10808.  aiohttp_socks uses socks5:// rather
    # than socks5h://, but rdns=True makes DNS resolve through the proxy,
    # which is equivalent.
    proxy_url = "socks5://127.0.0.1:10808"
    connector = ProxyConnector.from_url(proxy_url, rdns=True)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    semaphore = asyncio.Semaphore(10)  # 10 concurrent downloads
    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        tasks = []
        for url in urls:
            decoded_url = urllib.parse.unquote(url)
            volume_name = decoded_url.split("/")[-1]
            filename = f"html_files/{volume_name}.html"
            tasks.append(download_file(session, url, filename, semaphore))
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,65 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
import concurrent.futures
def download_url(url):
    """Download a single volume page to html_files/, retrying up to 3 times.

    Returns (True, filename) when the file exists or was downloaded, and
    (False, filename) after all retries failed.

    NOTE(review): source indentation was lost in transit; restored here.
    The success message read "(unknown)" — restored to {volume_name}.
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"
    if os.path.exists(filename):
        # Already exists
        return True, filename
    success = False
    retries = 3
    while not success and retries > 0:
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                success = True
                print(f"Successfully downloaded {volume_name}")
                return True, filename
            else:
                print(f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}")
        except RequestException as e:
            # Was a silent `pass`; surface the error so failures are visible.
            print(f"Request error for {url}: {e}. Retries left: {retries - 1}")
        if not success:
            retries -= 1
            time.sleep(2)
    if not success:
        print(f"Failed to download {url} after all retries.")
        return False, filename
    return False, filename  # unreachable: the success path returns above
def main():
    """Download all URLs from missing_urls.txt using 20 worker threads.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # Use ThreadPoolExecutor to download multiple files concurrently.
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = [executor.submit(download_url, url) for url in urls]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Exception during download: {e}")


if __name__ == "__main__":
    main()

View File

@@ -1,79 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
import concurrent.futures
def download_url(url):
    """Download one volume page with 5 retries and a longer back-off on
    HTTP 403 (rate limiting).

    Returns (True, filename) when the file exists or was downloaded, and
    (False, filename) after all retries failed.

    NOTE(review): source indentation was lost in transit; restored here.
    The success message read "(unknown)" — restored to {volume_name}.
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"
    if os.path.exists(filename):
        return True, filename
    success = False
    retries = 5
    while not success and retries > 0:
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                success = True
                print(f"Successfully downloaded {volume_name}")
                return True, filename
            elif response.status_code == 403:
                print(
                    f"HTTP Error 403 for {url}. Waiting longer to avoid rate limit..."
                )
                time.sleep(5)
            else:
                print(
                    f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}"
                )
        except RequestException as e:
            # Was a silent `pass`; surface the error so failures are visible.
            print(f"Request error for {url}: {e}. Retries left: {retries - 1}")
        if not success:
            retries -= 1
            time.sleep(3)
    if not success:
        print(f"Failed to download {url} after all retries.")
        return False, filename
    return False, filename  # unreachable: the success path returns above
def main():
    """Retry the still-missing volumes listed in still_missing.txt with a
    small thread pool (5 workers, to stay under the 403 rate limit).

    NOTE(review): source indentation was lost in transit; restored here.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("still_missing.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    print(f"Starting download for {len(urls)} remaining files...")
    # Use ThreadPoolExecutor to download multiple files concurrently.
    # Reduced max_workers to 5 to avoid triggering 403 Forbidden rate limits.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(download_url, url) for url in urls]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Exception during download: {e}")


if __name__ == "__main__":
    main()

View File

@@ -1,96 +0,0 @@
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7070%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7077%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B02
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B03
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B04
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B05
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B06
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B07
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B08
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B09
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B10
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B9
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B010
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B9
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B010
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B011
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B012
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7106%E4%B9%8B2

View File

@@ -1,53 +0,0 @@
# Exploratory parser for one scraped wikisource volume: pull the tone/rhyme
# heading, the list of rhyme characters, strip repeated page headers, and
# tokenize the remaining text into word/annotation pairs.
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")
# All volume text lives in a single div with class "poem".
text = soup.find("div", class_="poem").get_text()
# Extract Tone and Rhyme
tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
if tone_rhyme_m:
    print("Tone:", tone_rhyme_m.group(1))
    print("Rhyme:", tone_rhyme_m.group(2))
# Extract Volume
vol_m = re.search(r"卷(.)之(.)", text)
if vol_m:
    # Zero-pad single-character (Chinese-numeral) volume numbers.
    print("Volume:", f"0{vol_m.group(1)}{vol_m.group(2)}" if vol_m.group(1) in '一二三四五六七八九' else f"{vol_m.group(1)}{vol_m.group(2)}")
# Extract chars
lines = text.split('\n')
rhyme_chars = []
# NOTE(review): tone_rhyme_m is used unguarded below — if the tone/rhyme
# regex above did not match, this loop raises AttributeError.
for i, line in enumerate(lines):
    if tone_rhyme_m.group(2) in line:
        # the next line usually has the characters
        chars_line = lines[i+1]
        # NOTE(review): both replace() arguments render as a space here —
        # presumably full-width space -> ASCII space; confirm the bytes.
        rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1]
        break
print("Chars:", rhyme_chars)
# Now, we want to strip all lines that are headers.
# Actually, headers repeat: "欽定四庫全書", "御定佩文韻府...", "上平聲..."
# We can just filter out these known header lines!
clean_lines = []
for line in lines:
    stripped = line.strip()
    if not stripped: continue
    if stripped == "欽定四庫全書": continue
    if stripped.startswith("御定佩文韻府"): continue
    if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped:
        # NOTE(review): '"" in stripped' is always True, so every line that
        # reaches this branch is skipped; the literal looks garbled in
        # transfer (likely a specific character such as 韻) — confirm.
        if "" in stripped: continue
    # Is it the chars line?
    if all(c in rhyme_chars or c in '  ' for c in stripped):
        continue
    clean_lines.append(stripped)
clean_text = "".join(clean_lines)
print("Start of clean text:", clean_text[:100])
# Pair each headword with the run of 〈...〉 annotation blocks following it.
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
print("First token:", tokens[0])

View File

@@ -1,10 +0,0 @@
from bs4 import BeautifulSoup

# Peek at the first twenty <p> elements inside div.poem to learn the page
# structure before writing the real parser.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as page:
    markup = page.read()
soup = BeautifulSoup(markup, "html.parser")
poem_div = soup.find("div", class_="poem")
if poem_div:
    for i, para in enumerate(poem_div.find_all("p")[:20]):
        print(f"--- P {i} ---")
        print(para.text[:100].replace('\n', ' '))

View File

@@ -1,12 +0,0 @@
import re
from bs4 import BeautifulSoup

# Print the first fifty non-empty lines of the poem container, numbered, so
# the header/body layout can be inspected by eye.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as page:
    soup = BeautifulSoup(page.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
if poem_div:
    raw_lines = poem_div.get_text().split("\n")
    lines = [entry.strip() for entry in raw_lines if entry.strip()]
    for i, line in enumerate(lines[:50]):
        print(f"{i}: {line}")

View File

@@ -1,5 +0,0 @@
import re

# Sanity-check the tokenizer: every headword must be paired with the full
# run of 〈...〉 annotation blocks that immediately follows it.
text = "對語〈渭北 江東〉〈 平北 安東〉摘句〈力障百川東〉"
pattern = re.compile(r"([^〈〉]*)((?:〈[^〉]+〉)+)")
tokens = pattern.findall(text)
for i, (word, desc_blocks) in enumerate(tokens):
    print(f"Token {i}: WORD='{word}' DESCS={desc_blocks}")

View File

@@ -1,54 +0,0 @@
import json
import re
# Load the scraped dictionary. NOTE(review): `data` is read here for
# interactive exploration only; the test cases below do not reference it.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Headword prefixes to strip before mapping 丨 placeholders back to characters.
# NOTE(review): the second entry renders as "" — likely a garbled character;
# as written it is a harmless no-op (strips nothing). Confirm the original.
prefixes = ["韻藻", ""]


def replace_pipes(content, word):
    """Replace each 丨 placeholder in *content* with the matching character of *word*.

    The classical source abbreviates the headword inside a quotation by
    writing 丨 for each of its characters. Pipes are mapped to the headword's
    characters in order; a run of five or more non-pipe characters is taken
    to start a fresh occurrence of the headword, so the mapping index resets.

    Bug fix: the placeholder literal had been garbled to "" in the source
    (`char == ""` is never true for a length-1 char), which made the whole
    function a no-op. The intact `丨` characters in the test data establish
    the intended literal.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break  # only strip one prefix
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholders entirely.
        return content.replace("丨", "")
    result = []
    pipe_idx = 0
    chars_since_last_pipe = 0
    for char in content:
        if char == "丨":
            if chars_since_last_pipe >= 5:
                # A long gap means a new occurrence of the headword begins,
                # so restart the character mapping from its first character.
                pipe_idx = 0
            result.append(clean_word[pipe_idx % word_len])
            pipe_idx += 1
            chars_since_last_pipe = 0
        else:
            result.append(char)
            chars_since_last_pipe += 1
    return "".join(result)
# Run the substitution over a handful of real entries and eyeball the output.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for headword, quote in test_cases:
    print(f"Word: {headword}")
    print(f"Orig: {quote}")
    print(f"Fix : {replace_pipes(quote, headword)}")
    print("-" * 40)

View File

@@ -1,37 +0,0 @@
import json
# NOTE(review): the second prefix renders as "" — likely a garbled character;
# stripping an empty prefix is a no-op, so it is harmless as written.
prefixes = ["韻藻", ""]


def replace_pipes_no_reset(content, word):
    """Map every 丨 in *content* to *word*'s characters cyclically.

    Unlike the reset variant, the rolling index never restarts: the n-th
    pipe in the whole entry always gets character n mod len(word).

    Bug fix: the placeholder literal had been garbled to "" in the source
    (`char == ""` is never true for a length-1 char), making the function a
    no-op; restored to 丨 as established by the intact test data.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholders entirely.
        return content.replace("丨", "")
    result = []
    pipe_idx = 0
    for char in content:
        if char == "丨":
            result.append(clean_word[pipe_idx % word_len])
            pipe_idx += 1
        else:
            result.append(char)
    return "".join(result)
# Print the no-reset variant's output for the same sample entries.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for headword, quote in test_cases:
    print(f"Word: {headword}")
    print(f"NoRst: {replace_pipes_no_reset(quote, headword)}")
    print("-" * 40)

View File

@@ -1,56 +0,0 @@
import json
import re
# NOTE(review): the second prefix renders as "" — likely a garbled character;
# stripping an empty prefix is a no-op, so it is harmless as written.
prefixes = ["韻藻", ""]


def replace_pipes_hybrid(content, word):
    """Replace runs of 丨 in *content* with characters of *word*.

    Hybrid strategy: a run whose length is an exact multiple of the headword
    length is treated as one or more complete occurrences of the headword
    (and resets the rolling index); any other run continues the rolling
    character sequence from where the previous run stopped.

    Bug fix: the empty-word fallback's literal had been garbled to "" in the
    source, making that replace a no-op; restored to 丨, matching the intact
    r'丨+' pattern used below.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholders entirely.
        return content.replace("丨", "")

    def repl(match):
        nonlocal pipe_idx
        block = match.group(0)
        block_len = len(block)
        if block_len % word_len == 0:
            # Full word match! Reset alignment.
            pipe_idx = 0
            return clean_word * (block_len // word_len)
        else:
            # Partial run: continue from the current rolling position.
            res = ""
            for _ in range(block_len):
                res += clean_word[pipe_idx % word_len]
                pipe_idx += 1
            return res

    pipe_idx = 0
    return re.sub(r'丨+', repl, content)
# Compare the hybrid strategy against the sample entries, then against one
# extra case where a full-length run should reset the alignment.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for headword, quote in test_cases:
    print(f"Word: {headword}")
    print(f"Orig: {quote}")
    print(f"Hybr: {replace_pipes_hybrid(quote, headword)}")
    print("-" * 40)
test_cases.append(("紫殿東", "時魚躍丨丨丨温庭筠詩一夕丨丨"))
for headword, quote in test_cases[-1:]:
    print(f"Word: {headword}")
    print(f"Orig: {quote}")
    print(f"Hybr: {replace_pipes_hybrid(quote, headword)}")
    print("-" * 40)

View File

@@ -1,31 +0,0 @@
import json
from parser import parse_html
def test_parse_html():
    """Smoke-test parse_html on one downloaded volume and dump the result.

    NOTE(review): requires html_files/卷001之1.html to exist locally.
    """
    file_path = "html_files/卷001之1.html"
    result = parse_html(file_path)
    # Save for manual inspection
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    # Check that it returns a dictionary
    assert isinstance(result, dict)
    # Let's see what keys are in it
    keys = list(result.keys())
    print("Keys found:", keys)
    if len(keys) > 0:
        first_key = keys[0]
        # NOTE(review): the three "" membership checks below test for an
        # empty-string key — almost certainly garbled Chinese field names
        # (compare the intact 小韵描述/词条 keys below); confirm against the
        # original file.
        assert "" in result[first_key]
        assert "" in result[first_key]
        assert "" in result[first_key]
        assert "小韵描述" in result[first_key]
        assert "词条" in result[first_key]
        assert "对语" in result[first_key]
        assert "摘句" in result[first_key]
if __name__ == "__main__":
    test_parse_html()
    print("Tests passed!")

View File

@@ -1,49 +0,0 @@
import json
import re
# Pilot study: for the first few rhymes, try substituting placeholder marks
# in 词条 entries with the headword characters and report how often the pipe
# count divides evenly into the headword length.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Analyze a few
count_match = 0
count_mismatch = 0
for rhyme, r_data in list(data.items())[:5]: # Skip metadata, preface
    if rhyme in ["metadata", "preface"]:
        continue
    print(f"\nRhyme: {rhyme}")
    # 1. 小韵描述
    desc = r_data.get("小韵描述", "")
    # NOTE(review): replace("", rhyme) inserts the rhyme between every
    # character — the needle was almost certainly the placeholder 丨,
    # garbled in transfer; confirm against the original file.
    desc_fixed = desc.replace("", rhyme)
    print(f"Desc original: {desc[:30]}...")
    print(f"Desc fixed: {desc_fixed[:30]}...")
    # 2. 词条
    for word, content in list(r_data.get("词条", {}).items())[:5]:
        # NOTE(review): count("") returns len(content) + 1 — same garbled
        # placeholder literal as above; as written pipe_count is never 0.
        pipe_count = content.count("")
        word_len = len(word)
        if pipe_count == 0:
            continue
        print(f"Word: {word} (len {word_len}), pipes: {pipe_count}")
        print(f"Original: {content}")
        # Test replacing
        if pipe_count % word_len == 0:
            # We can replace them in groups of word_len
            fixed_content = ""
            pipe_idx = 0
            for char in content:
                # NOTE(review): garbled placeholder literal again — a
                # length-1 char never equals "", so this branch never fires.
                if char == "":
                    fixed_content += word[pipe_idx % word_len]
                    pipe_idx += 1
                else:
                    fixed_content += char
            print(f"Fixed: {fixed_content}")
            count_match += 1
        else:
            print("MISMATCH length!")
            count_mismatch += 1
print(f"\nMatches: {count_match}, Mismatches: {count_mismatch}")

View File

@@ -1,46 +0,0 @@
import asyncio
from playwright.async_api import async_playwright
import urllib.parse
import sys
async def main():
    """Probe one URL with headless Chromium: first through the local SOCKS5
    proxy, then — unconditionally — with a direct connection, reporting the
    fetched content length for each attempt."""
    url = sys.argv[1]
    proxy_server = "socks5://127.0.0.1:10808"
    ua = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    async with async_playwright() as pw:
        # Attempt 1: route traffic through the local proxy.
        chromium = await pw.chromium.launch(headless=True, proxy={"server": proxy_server})
        ctx = await chromium.new_context(user_agent=ua)
        tab = await ctx.new_page()
        try:
            print(f"Loading {url} via proxy...")
            await tab.goto(url, timeout=30000, wait_until="domcontentloaded")
            body = await tab.content()
            print(f"Success! Content length: {len(body)}")
        except Exception as e:
            print(f"Error via proxy: {e}")
        await chromium.close()
        # Attempt 2: always retried directly, even when the proxy worked.
        print("Retrying without proxy...")
        chromium = await pw.chromium.launch(headless=True)
        ctx = await chromium.new_context(user_agent=ua)
        tab = await ctx.new_page()
        try:
            await tab.goto(url, timeout=30000, wait_until="domcontentloaded")
            body = await tab.content()
            print(f"Success! Content length: {len(body)}")
        except Exception as e2:
            print(f"Error without proxy: {e2}")
        await chromium.close()
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,21 +0,0 @@
import json
# Count 词条 entries whose placeholder count is not an exact multiple of the
# headword length (such entries cannot be mapped 1:1 back to the headword).
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
mismatches = []
prefixes_found = set()  # NOTE(review): never written to — dead variable
for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    for word, content in r_data.get("词条", {}).items():
        # NOTE(review): str.count("") returns len(content) + 1, so the
        # == 0 check below can never trigger — the literal was probably the
        # placeholder 丨, garbled in transfer; confirm the original file.
        pipe_count = content.count("")
        if pipe_count == 0:
            continue
        if pipe_count % len(word) != 0:
            mismatches.append((word, pipe_count))
print(f"Total mismatches: {len(mismatches)}")
for w, p in mismatches[:20]:
    print(f"{w} (len {len(w)}), pipes: {p}")

View File

@@ -1,13 +0,0 @@
import json

# Dump the raw entries for a few headwords whose placeholder substitution
# needed manual inspection.
TARGETS = {"紫殿東", "少微東", "東海東", "日夜東", "隔西東"}

with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
for rhyme, r_data in data.items():
    if rhyme in ("metadata", "preface"):
        continue
    for word, content in r_data.get("词条", {}).items():
        if word not in TARGETS:
            continue
        print(f"Word: {word}")
        print(f"Content: {content}")
        print("-" * 40)

View File

@@ -1,18 +0,0 @@
import re
from bs4 import BeautifulSoup
# Extract the volume's tone heading, rhyme name, and the line listing the
# rhyme's member characters from one scraped wikisource page.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
text = poem_div.get_text()
# Extract the list of characters.
# It appears after "一東韻一" or similar.
m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*?)\n(.*?)\n", text)
if m:
    print("Tone:", m.group(1))
    print("Rhyme:", m.group(2))
    print("Chars line:", m.group(3))
    # NOTE(review): both replace() arguments render as a space here —
    # presumably full-width space -> ASCII space; confirm the actual bytes.
    rhyme_chars = [c for c in m.group(3).replace(' ', ' ').split() if len(c) == 1]
    print("Chars:", rhyme_chars)

View File

@@ -1,32 +0,0 @@
import json
import re
# Measure how many 词条 entries still have a placeholder count that does not
# divide the headword length once known headword prefixes are stripped.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# NOTE(review): the second prefix renders as "" — probably a garbled
# character; stripping an empty prefix is a no-op, so confirm the original.
prefixes = ["韻藻", ""]
mismatches = []
total_pipes = 0
for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    for word, content in r_data.get("词条", {}).items():
        clean_word = word
        # Unlike the replace_pipes variants, this loop has no break, so
        # every matching prefix is stripped in turn.
        for p in prefixes:
            if clean_word.startswith(p) and len(clean_word) > len(p):
                clean_word = clean_word[len(p):]
        # NOTE(review): count("") returns len(content) + 1, so the == 0
        # check below never triggers — the literal was probably the
        # placeholder 丨, garbled in transfer; confirm the original file.
        pipe_count = content.count("")
        if pipe_count == 0:
            continue
        total_pipes += 1
        if pipe_count % len(clean_word) != 0:
            mismatches.append((word, clean_word, pipe_count, content))
print(f"Total entries with pipes: {total_pipes}")
print(f"Total mismatches after stripping: {len(mismatches)}")
for w, cw, p, c in mismatches[:10]:
    print(f"{w} -> {cw} (len {len(cw)}), pipes: {p}")
    print(f"  {c}")

View File

@@ -1,21 +0,0 @@
import re
from bs4 import BeautifulSoup
# Tokenization experiment on real page text: flatten the poem container and
# split it into (headword, annotation-blocks) pairs.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")
text = soup.find("div", class_="poem").get_text().replace('\n', '')
# Remove header junk for this test (just find the first '東〈')
start_idx = text.find('東〈')
text = text[start_idx:]
# Tokenize into pairs of (Word, Description)
# Using regex to find all Word〈Description〉
# Wait, multiple 〈 〉 can follow a word like 對語〈...〉〈...〉
# We can find all chunks of non-〈 characters, followed by one or more 〈...〉
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text)
for i, (word, desc_blocks) in enumerate(tokens[:20]):
    print(f"Token {i}: WORD='{word}' DESCS={desc_blocks[:30]}...")