Files
spider-ctext/初学记/generate_json.py
2026-03-22 16:18:35 +08:00

162 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import bs4
import os
import re
import json
# Directory containing the XHTML volumes extracted from the EPUB (one file per juan).
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Parse one volume's XHTML file into a flat event stream.

    Returns a list of ``(event_type, value)`` tuples where ``event_type`` is:
      - ``"section"``: a section heading (a ``<span>`` with an id inside a poem ``<p>``)
      - ``"text"``:    a run of body text, to be concatenated in document order
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    poem_divs = soup.find_all("div", class_="poem")
    events = []
    for div in poem_divs:
        for p in div.find_all("p"):
            for child in p.children:
                if child.name == "br":
                    pass  # ignore <br>; we want one continuous text run
                # NOTE(review): the empty-string literals in endswith("") /
                # startswith("") below appear to have LOST their marker
                # characters during export (the page warned about hidden
                # "ambiguous Unicode"). As written, endswith("") is always
                # True and `not startswith("")` is always False, so the
                # f-string branch is dead. Confirm against the original file.
                elif child.name == "span" and child.get("id") and child.text.strip().endswith(""):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    if not small_text.startswith("") and not small_text.startswith(""):
                        events.append(("text", f"{small_text}"))
                    else:
                        events.append(("text", small_text))
                elif isinstance(child, str):
                    # Plain NavigableString between tags.
                    events.append(("text", child))
                else:
                    # Any other tag: flatten to its text content.
                    events.append(("text", child.get_text()))
    return events
def extract_sections(content_text):
    """Split one entry's raw text into its three canonical sections.

    Layout of an entry: a narrative part ("叙事"), then optionally a
    paired-allusions part introduced by the marker 事對/事对 ("事对"),
    then the poetry/prose part ("诗文") which begins at the first genre
    character that directly follows a closing bracket 〉. Sections that
    are absent come back as empty strings.
    """
    # Genre names that may open the 诗文 section right after a 〉 bracket.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"

    pieces = {"叙事": "", "事对": "", "诗文": ""}

    # Locate the paired-allusions marker: traditional form first, then simplified.
    marker_pos = content_text.find("事對")
    if marker_pos == -1:
        marker_pos = content_text.find("事对")

    if marker_pos == -1:
        # No 事对 marker: only narrative vs. poetry need separating.
        hit = re.search(genre_pattern, content_text)
        if hit is None:
            pieces["叙事"] = content_text
        else:
            cut = hit.start() + 1  # keep the 〉 with the preceding section
            pieces["叙事"] = content_text[:cut]
            pieces["诗文"] = content_text[cut:]
    else:
        pieces["叙事"] = content_text[:marker_pos]
        remainder = content_text[marker_pos + 2:]  # skip the two-char marker itself
        hit = re.search(genre_pattern, remainder)
        if hit is None:
            pieces["事对"] = remainder
        else:
            cut = hit.start() + 1
            pieces["事对"] = remainder[:cut]
            pieces["诗文"] = remainder[cut:]

    # NOTE(review): the empty-string replace() calls mirror the original
    # source; their literals seem to have lost characters in export and are
    # no-ops as written. Kept verbatim to preserve behavior.
    return {k: v.replace("", "").replace("", "").strip() for k, v in pieces.items()}
def main():
    """Walk volumes 1–30, extract per-entry sections, and write 初学记.json."""
    categories = {}
    total_entries_found = 0
    for vol in range(1, 31):
        # Locate this volume's file by its zero-padded "juanNN.xhtml" suffix.
        filename = None
        for fn in os.listdir(html_dir):
            if fn.endswith(f"juan{vol:02d}.xhtml"):
                filename = fn
                break
        if not filename:
            print(f"Volume {vol} not found")
            continue
        events = parse_html(os.path.join(html_dir, filename))
        # Merge consecutive "text" events under the most recent "section"
        # heading, producing (section, full_text) pairs.
        merged = []
        current_section = ""
        current_text = []
        for ev_type, val in events:
            if ev_type == "section":
                if current_text:
                    merged.append((current_section, "".join(current_text)))
                    current_text = []
                current_section = val
            else:
                current_text.append(val)
        if current_text:
            merged.append((current_section, "".join(current_text)))
        for sec, txt in merged:
            # Each entry header looks like: <name>(第N[上下])?〈叙事〉, where the
            # name follows a boundary character (start of text, 〉, ), ideographic
            # space, or whitespace). group(1) is the bare entry name.
            matches = list(re.finditer(r"(?:^|[〉)\u3000\s])([^〈〉<>\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", txt))
            if not matches:
                continue
            for i, m in enumerate(matches):
                entry_name = m.group(1).strip()
                start_idx = m.end()  # content starts right after 〈叙事〉
                # End this entry at the NEXT entry's name — matches[i+1].start(1),
                # NOT matches[i+1].start(): the next match's optional one-char
                # prefix is usually the 〉 that closes the CURRENT entry's final
                # quoted poem (e.g. "…終負昔賢心〉虹蜺第七〈敘事〉"), so that
                # bracket must remain inside this entry's content slice.
                end_idx = matches[i+1].start(1) if i + 1 < len(matches) else len(txt)
                content_text = txt[start_idx:end_idx]
                sections = extract_sections(content_text)
                if entry_name not in categories:
                    categories[entry_name] = []
                categories[entry_name].append({
                    "volume": vol,
                    "section": sec,
                    "content": sections
                })
                total_entries_found += 1
    final_json = {
        "metadata": {
            "title": "初学记",
            "author": "徐坚",
            "dynasty": "",
            "total_volumes": 30,
            "source": "2026年1月28日从维基文库导出"
        },
        "preface": "",
        "categories": categories
    }
    with open("初学记.json", "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)
    print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.")


if __name__ == "__main__":
    main()