# spider-ctext/佩文韵府/fix_pipes.py

import json
import re

# Read the whole JSON document into memory as `data`.
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', encoding='utf-8') as f:
    data = json.loads(f.read())

# Marker strings that may be stuck to the front of a headword key.
prefixes = ["韻藻", "增", "増"]


def clean_headword(word):
    """Return *word* with leading marker strings from ``prefixes`` removed.

    The list of markers is scanned twice, in order, so that a stacked
    combination such as "増韻藻" is removed completely.  A marker is only
    dropped when at least one character remains after it, so stripping
    never produces an empty string.
    """
    stripped = word
    passes_left = 2
    while passes_left:
        passes_left -= 1
        for marker in prefixes:
            remainder = stripped[len(marker):]
            # A non-empty remainder means the word is strictly longer than the marker.
            if remainder and stripped.startswith(marker):
                stripped = remainder
    return stripped

def replace_pipes_in_content(content, word):
    """Fill in the "丨" placeholders in *content* with headword characters.

    *word* is first passed through ``clean_headword``.  Each maximal run of
    "丨" is then rewritten as follows:

    * a run whose length is a multiple of the headword length becomes that
      many whole copies of the headword, and the running position is reset
      to the start of the headword;
    * any other run is filled character by character, continuing from the
      position where the previous partial run stopped and wrapping around
      at the end of the headword.

    *content* is returned unchanged when the cleaned headword is empty or
    when *content* contains no placeholder.
    """
    headword = clean_headword(word)
    size = len(headword)

    if size == 0 or "丨" not in content:
        return content

    # Index (before wrapping) of the headword character that the next
    # partially matching run starts from; shared across all runs.
    cursor = 0

    def expand(run_match):
        nonlocal cursor
        run_len = len(run_match.group(0))
        repeats, leftover = divmod(run_len, size)

        if leftover == 0:
            # The run covers whole headwords: emit them and realign.
            cursor = 0
            return headword * repeats

        # The run covers only part of a headword: keep cycling from where
        # the previous partial run left off.
        start = cursor
        cursor += run_len
        return "".join(headword[i % size] for i in range(start, cursor))

    return re.sub(r'丨+', expand, content)

print("Processing...")
for rhyme_key, entry in data.items():
    # These two top-level keys are not processed.
    if rhyme_key in ("metadata", "preface"):
        continue

    # 1. Fix 小韵描述: every placeholder stands for the rhyme character.
    # The dictionary key itself is used as the replacement.
    # NOTE(review): the key may be a simplified form (e.g. "东") rather than
    # the form printed in the text; using the key was a deliberate choice.
    if "小韵描述" in entry and entry["小韵描述"]:
        entry["小韵描述"] = entry["小韵描述"].replace("丨", rhyme_key)

    # 2. Fix 词条: rebuild the mapping with placeholders expanded per headword.
    if "词条" in entry:
        entry["词条"] = {
            headword: replace_pipes_in_content(text, headword)
            for headword, text in entry["词条"].items()
        }

# Write the result back over the input file.
print("Saving peiwenyunfu.json...")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
print("Done!")