Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
161
初学记/generate_json.py
Normal file
161
初学记/generate_json.py
Normal file
@@ -0,0 +1,161 @@
|
||||
import bs4
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
|
||||
# Directory holding the XHTML files extracted from the source EPUB.
html_dir = "epub_extracted/OPS"
|
||||
|
||||
def parse_html(filepath):
    """Parse one XHTML volume into a flat, ordered list of (type, value) events.

    Event types:
      * ("section", name) -- a section heading: a <span> with an id whose
        stripped text ends in 部.
      * ("text", s)       -- a run of body text, in document order.

    <br> tags are skipped so each paragraph yields continuous text.
    <small> inline annotations are wrapped in parentheses unless the source
    already bracketed them with 〈 or (.

    Parameters
    ----------
    filepath : str
        Path to an XHTML file; read as UTF-8.

    Returns
    -------
    list[tuple[str, str]]
        The event stream for all <div class="poem"> blocks in the file.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    events = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            for child in p.children:
                if child.name == "br":
                    # Ignore line breaks; we want continuous text.
                    continue
                if child.name == "span" and child.get("id") and child.text.strip().endswith("部"):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    if small_text.startswith(("〈", "(")):
                        # Already bracketed in the source; keep as-is.
                        events.append(("text", small_text))
                    else:
                        events.append(("text", f"({small_text})"))
                elif isinstance(child, str):
                    # NavigableString: plain text between tags.
                    events.append(("text", child))
                else:
                    # Any other tag: flatten to its text content.
                    events.append(("text", child.get_text()))

    return events
|
||||
|
||||
|
||||
def extract_sections(content_text):
    """Split one entry's raw text into its three canonical sections.

    An entry in 初学记 is laid out as 叙事 (narrative), then optionally
    事對/事对 (paired allusions), then 诗文 (poetry and prose).  The 事對
    marker is searched literally, traditional form first, then simplified.
    The start of the 诗文 section is detected by a genre character
    (賦, 詩, 讚, ...) immediately following a closing 〉 bracket; the 〉
    itself stays with the preceding section.

    Parameters
    ----------
    content_text : str
        The entry body, starting right after its 〈叙事〉 heading.

    Returns
    -------
    dict
        Keys "叙事", "事对", "诗文".  CJK angle brackets 〈〉 are
        normalized to ( ) and every value is stripped of surrounding
        whitespace; sections that are absent come back as "".
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # Locate the 事對 marker; the traditional form takes precedence.
    marker = "事對"
    marker_pos = content_text.find(marker)
    if marker_pos == -1:
        marker_pos = content_text.find("事对")

    # A genre character right after a closing 〉 opens the 诗文 section.
    genre_re = re.compile(
        r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"
    )

    if marker_pos != -1:
        result["叙事"] = content_text[:marker_pos]
        after_marker = content_text[marker_pos + len(marker):]
        genre_match = genre_re.search(after_marker)
        if genre_match:
            # +1 keeps the 〉 with 事对; the genre char starts 诗文.
            cut = genre_match.start() + 1
            result["事对"] = after_marker[:cut]
            result["诗文"] = after_marker[cut:]
        else:
            result["事对"] = after_marker
    else:
        # No 事對 section: split directly into 叙事 / 诗文.
        genre_match = genre_re.search(content_text)
        if genre_match:
            cut = genre_match.start() + 1
            result["叙事"] = content_text[:cut]
            result["诗文"] = content_text[cut:]
        else:
            result["叙事"] = content_text

    # Normalize CJK angle brackets to parentheses and trim whitespace.
    bracket_map = str.maketrans("〈〉", "()")
    return {k: v.translate(bracket_map).strip() for k, v in result.items()}
|
||||
|
||||
def main():
    """Build 初学记.json from the extracted XHTML volumes.

    For each of the 30 volumes: parse the XHTML into an event stream,
    merge consecutive text runs under their section headings, locate
    each entry by its `名稱(第N)〈叙事〉` heading, split the entry body
    with extract_sections(), and accumulate everything into a
    category-name -> entries mapping written out as JSON.
    """
    categories = {}
    total_entries_found = 0

    for vol in range(1, 31):
        # Locate this volume's file by suffix, e.g. "...juan07.xhtml".
        filename = None
        for fn in os.listdir(html_dir):
            if fn.endswith(f"juan{vol:02d}.xhtml"):
                filename = fn
                break

        if not filename:
            print(f"Volume {vol} not found")
            continue

        events = parse_html(os.path.join(html_dir, filename))

        # Merge the event stream into (section, continuous_text) runs:
        # text accumulates until the next section heading flushes it.
        merged = []
        current_section = ""
        current_text = []
        for ev_type, val in events:
            if ev_type == "section":
                if current_text:
                    merged.append((current_section, "".join(current_text)))
                    current_text = []
                current_section = val
            else:
                current_text.append(val)
        if current_text:
            merged.append((current_section, "".join(current_text)))

        for sec, txt in merged:
            # An entry heading is `名稱` + optional `第N(上/下)` + `〈叙事〉`,
            # optionally preceded by a boundary char (〉, ), ideographic
            # space, or whitespace).  Group 1 captures the bare name.
            matches = list(re.finditer(r"(?:^|[〉)\u3000\s])([^〈〉<>()\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", txt))
            if not matches:
                continue

            for i, m in enumerate(matches):
                entry_name = m.group(1).strip()
                start_idx = m.end()  # content starts right after 〈叙事〉
                # End the slice at the NEXT entry's name -- start(1), not
                # start().  The optional boundary char before that name is
                # usually the 〉 closing this entry's final quotation, so
                # it must remain inside the current entry's content.
                end_idx = matches[i + 1].start(1) if i + 1 < len(matches) else len(txt)

                sections = extract_sections(txt[start_idx:end_idx])

                categories.setdefault(entry_name, []).append({
                    "volume": vol,
                    "section": sec,
                    "content": sections,
                })
                total_entries_found += 1

    final_json = {
        "metadata": {
            "title": "初学记",
            "author": "徐坚",
            "dynasty": "唐",
            "total_volumes": 30,
            "source": "2026年1月28日从维基文库导出",
        },
        "preface": "",
        "categories": categories,
    }

    with open("初学记.json", "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)

    print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.")
|
||||
|
||||
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user