# spider-ctext/佩文韵府/test_heuristic.py

import json
import re

# Read the JSON dump that is expected to sit in the current working directory.
# NOTE(review): `data` is not referenced anywhere in the visible part of this
# script; the load is kept so the module's side effects stay the same.
with open("peiwenyunfu.json", encoding="utf-8") as source:
    data = json.loads(source.read())

# Markers that may precede a headword; at most one of them is stripped before
# the headword's characters are substituted for the "丨" placeholders.
prefixes = ["韻藻", "增"]


def replace_pipes(content: str, word: str, *, gap_threshold: int = 5) -> str:
    """Replace every "丨" in *content* with a character taken from *word*.

    The first matching entry of ``prefixes`` is stripped from *word* (only
    when something remains afterwards).  The placeholders are then filled
    with the remaining characters in order, wrapping around to the first
    character once the word is exhausted.  Whenever at least
    *gap_threshold* ordinary characters separate a placeholder from the
    previous one (or from the start of *content*), the placeholder is
    treated as the start of a new occurrence and the position in the word
    restarts at its first character.

    Args:
        content: Text containing "丨" placeholders.
        word: Headword whose characters replace the placeholders.
        gap_threshold: Minimum number of non-placeholder characters since
            the previous placeholder that restarts the word from its first
            character.  Defaults to 5, the value this heuristic was
            written with.

    Returns:
        *content* with every "丨" replaced.  If *word* is empty the
        placeholders are simply removed.
    """
    clean_word = word
    for p in prefixes:
        # Require a non-empty remainder so a word that *is* the marker
        # (e.g. "增") is kept intact.
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break  # only strip one prefix

    if not clean_word:
        # Nothing to substitute: drop the placeholders.
        return content.replace("丨", "")

    word_len = len(clean_word)
    result = []
    pipe_idx = 0  # index into clean_word of the next character to emit
    chars_since_last_pipe = 0

    for char in content:
        if char != "丨":
            result.append(char)
            chars_since_last_pipe += 1
            continue

        # A long run of ordinary text means this placeholder belongs to a
        # new occurrence of the word rather than continuing the last one.
        if chars_since_last_pipe >= gap_threshold:
            pipe_idx = 0

        # The modulo keeps the index valid when an occurrence contains
        # more placeholders than the word has characters.
        result.append(clean_word[pipe_idx % word_len])
        pipe_idx += 1
        chars_since_last_pipe = 0

    return "".join(result)

# Hand-picked (headword, passage) samples used to eyeball the heuristic.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]

# Show each sample before and after placeholder expansion, followed by a
# 40-character dashed divider.
for headword, passage in test_cases:
    print("Word:", headword)
    print("Orig:", passage)
    print("Fix :", replace_pipes(passage, headword))
    print(40 * "-")