"""Convert the extracted EPUB of 初学记 into a single structured JSON file.

Each ``juanNN.xhtml`` volume under ``html_dir`` is parsed into a flat stream
of text/section events, cut into entries at every ``〈叙事〉`` marker, and each
entry is split into its 叙事 / 事对 / 诗文 parts before being written to
``初学记.json``.
"""

import json
import os
import re

try:
    import bs4
except ImportError:
    # Only parse_html() needs BeautifulSoup; the text-splitting helpers stay
    # importable (and unit-testable) without it.
    bs4 = None

html_dir = "epub_extracted/OPS"

# A closing annotation bracket directly followed by a genre label marks the
# start of the 诗文 part.
_GENRE_PATTERN = re.compile(
    r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"
)

# The 事對 marker together with its surrounding brackets.
_SHIDUI_BRACKETED = re.compile(r"[〈(]事[對对][〉)]")

# Entry heading: a boundary character (or start of text), the entry name
# (group 1), an optional ordinal such as 第七 / 第一上, then the 〈叙事〉 marker.
_ENTRY_PATTERN = re.compile(
    r"(?:^|[〉)\u3000\s])([^〈〉<>()\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉"
)

_OPENING_BRACKETS = ("〈", "(")
_CLOSING_BRACKETS = ("〉", ")")
_BRACKET_TABLE = str.maketrans({"〈": "(", "〉": ")"})


def parse_html(filepath):
    """Parse one volume file into a flat list of ``(kind, value)`` events.

    Only ``<p>`` elements inside ``<div class="poem">`` are read.  ``kind`` is
    ``"section"`` for a ``<span>`` that has an ``id`` and whose text ends with
    ``部``; every other child except ``<br>`` yields a ``"text"`` event.

    Raises:
        ImportError: if the ``bs4`` package is not installed.
    """
    if bs4 is None:
        raise ImportError("parse_html requires the 'beautifulsoup4' package (bs4)")

    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    events = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            for child in p.children:
                if child.name == "br":
                    # Line breaks are dropped so the text stays continuous.
                    continue
                if (
                    child.name == "span"
                    and child.get("id")
                    and child.text.strip().endswith("部")
                ):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    # Small text that is not already bracketed is wrapped in
                    # parentheses.
                    if not small_text.startswith(_OPENING_BRACKETS):
                        small_text = f"({small_text})"
                    events.append(("text", small_text))
                elif isinstance(child, str):
                    events.append(("text", str(child)))
                else:
                    events.append(("text", child.get_text()))
    return events


def _find_shidui_marker(text):
    """Return the index of the ``事`` of the 事對 marker in *text*, or -1.

    A bracketed marker is preferred so that the same two characters occurring
    earlier in running text do not split the entry; otherwise the first bare
    ``事對`` (then ``事对``) is used.
    """
    bracketed = _SHIDUI_BRACKETED.search(text)
    if bracketed:
        return bracketed.start() + 1
    idx = text.find("事對")
    if idx == -1:
        idx = text.find("事对")
    return idx


def extract_sections(content_text):
    """Split the text of one entry into its 叙事 / 事对 / 诗文 parts.

    Args:
        content_text: the entry text that follows its ``〈叙事〉`` marker.

    Returns:
        A dict with the keys ``"叙事"``, ``"事对"`` and ``"诗文"`` (empty string
        when a part is absent).  ``〈`` / ``〉`` are rewritten to ``(`` / ``)``
        and surrounding whitespace is stripped.
    """
    narrative, shidui, shiwen = content_text, "", ""

    marker_idx = _find_shidui_marker(content_text)
    if marker_idx != -1:
        narrative = content_text[:marker_idx]
        rest = content_text[marker_idx + 2:]
        marker_is_bracketed = narrative.endswith(_OPENING_BRACKETS) and rest.startswith(
            _CLOSING_BRACKETS
        )

        genre = _GENRE_PATTERN.search(rest)
        if genre:
            # Split just after the closing bracket, before the genre label.
            cut = genre.start() + 1
            shidui, shiwen = rest[:cut], rest[cut:]
        else:
            shidui = rest

        if marker_is_bracketed:
            # The marker's own brackets belong to neither part; without this
            # 叙事 would end in a dangling "(" and 事对 would start with ")".
            narrative = narrative[:-1]
            shidui = shidui[1:]
    else:
        genre = _GENRE_PATTERN.search(content_text)
        if genre:
            cut = genre.start() + 1
            narrative, shiwen = content_text[:cut], content_text[cut:]

    parts = {"叙事": narrative, "事对": shidui, "诗文": shiwen}
    return {key: value.translate(_BRACKET_TABLE).strip() for key, value in parts.items()}


def _find_volume_file(filenames, vol):
    """Return the first name in *filenames* ending in ``juanNN.xhtml``, or None."""
    suffix = f"juan{vol:02d}.xhtml"
    return next((fn for fn in filenames if fn.endswith(suffix)), None)


def _merge_events(events):
    """Collapse an event stream into ``(section, text)`` pairs.

    Consecutive text events are concatenated; a section event closes the text
    collected so far and names the section for the text that follows.
    """
    merged = []
    current_section = ""
    current_text = []
    for ev_type, val in events:
        if ev_type == "section":
            if current_text:
                merged.append((current_section, "".join(current_text)))
                current_text = []
            current_section = val
        else:
            current_text.append(val)
    if current_text:
        merged.append((current_section, "".join(current_text)))
    return merged


def _iter_entries(txt):
    """Yield ``(entry_name, content_text)`` for every entry heading in *txt*.

    An entry's content runs from the end of its ``〈叙事〉`` marker up to the
    first character of the next entry's name.  The slice ends at ``start(1)``
    rather than ``start()`` on purpose: the boundary character in front of the
    next name (e.g. the ``〉`` in ``...終負昔賢心〉虹蜺第七〈敘事〉``) closes the
    current entry's last quotation and must stay with the current entry.
    """
    matches = list(_ENTRY_PATTERN.finditer(txt))
    ends = [m.start(1) for m in matches[1:]] + [len(txt)]
    for m, end_idx in zip(matches, ends):
        yield m.group(1).strip(), txt[m.end():end_idx]


def main():
    """Parse volumes 1-30 from ``html_dir`` and write ``初学记.json``."""
    categories = {}
    total_entries_found = 0

    # List the directory once; sorting makes the pick deterministic when more
    # than one file name matches a volume.
    filenames = sorted(os.listdir(html_dir))

    for vol in range(1, 31):
        filename = _find_volume_file(filenames, vol)
        if not filename:
            print(f"Volume {vol} not found")
            continue

        events = parse_html(os.path.join(html_dir, filename))
        for sec, txt in _merge_events(events):
            for entry_name, content_text in _iter_entries(txt):
                categories.setdefault(entry_name, []).append({
                    "volume": vol,
                    "section": sec,
                    "content": extract_sections(content_text),
                })
                total_entries_found += 1

    final_json = {
        "metadata": {
            "title": "初学记",
            "author": "徐坚",
            "dynasty": "唐",
            "total_volumes": 30,
            "source": "2026年1月28日从维基文库导出",
        },
        "preface": "",
        "categories": categories,
    }

    with open("初学记.json", "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)

    print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.")


if __name__ == "__main__":
    main()