Using LLM for Multi-level Title Hierarchy Enhancement Example Reference

Basic Concept

Use LLM to re-evaluate the hierarchy of all secondary titles

Model Recommendation

We recommend using R1 or a similar reasoning model; other models may not achieve ideal results.

Reference Code

python

import re
import requests
import json
import time

# Model recommendation: use R1 or a similar reasoning model.

# Bearer token that request_llm places in the Authorization header.
api_key = ""
# Model identifier sent as the 'model' field of the request body.
model = ""
# Endpoint that request_llm POSTs to. The reply is read from
# choices[0].message.content, so this is presumably an OpenAI-compatible
# chat-completions endpoint (confirm against your provider).
url = ""

def request_llm(messages: list[dict], timeout: float = 600.0) -> str:
    """
    Send chat messages to the configured LLM endpoint and return the reply text.

    Args:
        messages: Chat messages, forwarded unchanged as the 'messages' field
            of the JSON request body.
        timeout: Seconds to wait for the server before giving up. The default
            is generous because reasoning models can take minutes to answer;
            without any timeout a stalled connection would hang forever.

    Returns:
        The 'content' of the first choice's message in the JSON response.

    Raises:
        ValueError: If the HTTP status is not 200, or the response body does
            not have the expected choices[0].message.content structure.
        requests.exceptions.RequestException: On network failures, including
            the timeout expiring.
    """
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }
    data = {
        'model': model,
        'messages': messages
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f'request llm failed, {response.status_code} {response.text}')
    try:
        return response.json()['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, ValueError) as err:
        # A 200 response with an unexpected body (error payload, empty
        # choices, invalid JSON) is reported with the raw text for debugging.
        raise ValueError(f'unexpected llm response format, {response.text}') from err

def merge_and_fix(markdown_text: str, origin_titles: list[dict], fixed_titles: list[dict]) -> str:
    """
    Replace each original title line in the markdown with its corrected version.

    Titles are matched as whole lines and consumed in document order, one
    line per title. A global substring replace would corrupt the text:
    '# Intro' is a substring of '## Intro', and a line that was already
    corrected could be matched again by a later title.

    Args:
        markdown_text: The full markdown document.
        origin_titles: Titles as found in the document, in order; each dict
            must have a 'text' key holding the exact original line.
        fixed_titles: Corrected titles, parallel to origin_titles; each dict
            must have a 'text' key holding the replacement line.

    Returns:
        The markdown text with every title line replaced.

    Raises:
        ValueError: If the two lists differ in length, or if some original
            title cannot be found (in order) as a line of markdown_text.
    """
    if len(origin_titles) != len(fixed_titles):
        raise ValueError(f'Length mismatch between origin_titles and output_titles, {len(origin_titles)} != {len(fixed_titles)}')
    total = len(origin_titles)
    lines = markdown_text.split('\n')
    next_title = 0  # index of the next title pair waiting to be applied
    for line_no, line in enumerate(lines):
        if next_title == total:
            break
        if line == origin_titles[next_title]['text']:
            lines[line_no] = fixed_titles[next_title]['text']
            next_title += 1
    if next_title != total:
        missing = origin_titles[next_title]['text']
        raise ValueError(f'Title line not found in markdown text (in order): {missing!r}')
    return '\n'.join(lines)

def get_titles_levels_from_markdown(markdown_text: str) -> list[dict]:
    """
    Extract titles from markdown text

    Every line that starts with '#' is treated as a title. Lines inside
    fenced code blocks are not filtered out.

    return: list[dict] = [{
        'title': str, # Title text without the heading markers
        'level': int, # Title level (number of leading #)
        'text': str # The original line, unchanged
    }]
    """
    res = []
    for line in markdown_text.split('\n'):
        if not line.startswith('#'):
            continue
        # Only the leading run of '#' sets the level; counting every '#'
        # in the line would inflate it for titles such as '# C# Guide'.
        level = len(line) - len(line.lstrip('#'))
        # Drop an optional closing marker ('## Title ##') but keep any '#'
        # that belongs to the title text itself.
        title = re.sub(r'\s+#+\s*$', '', line[level:]).strip()
        res.append({
            'title': title,
            'level': level,
            'text': line
        })
    return res

def build_messages(title_levels: list[dict]) -> list[dict]:
    """
    Build the system and user messages for the title-correction request.

    Each title becomes one line of the form <idx><hLEVEL>title</hLEVEL>,
    where idx is its zero-based position in title_levels.
    """
    system_content = (
        "You are an XML title hierarchy correction expert. Based on semantics, adjust title hierarchies to correct levels. If there are content errors in titles, modify them cautiously - only if you're certain, otherwise don't modify. Multiple first-level titles are allowed when appropriate. "
        "The <{{id}}> format in the input is for matching results, you must never modify it. "
        "Adding, deleting titles, or changing title order is not allowed."
    )
    tagged_lines = [
        f"<{position}><h{entry['level']}>{entry['title']}</h{entry['level']}>\n"
        for position, entry in enumerate(title_levels)
    ]
    user_content = ''.join([
        'Input content:\n',
        *tagged_lines,
        '\nKeep output format consistent, output content:\n',
    ])
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

def read_output(content: str) -> list[dict]:
    """
    Read output content and return titles and levels

    Only lines of the form <idx><hLEVEL>title</hLEVEL> are parsed. Blank
    lines and any other text the model emits around them (preamble, code
    fences) are skipped instead of crashing the parser.

    return: list[dict] = [{
        'title': str, # Title without #
        'level': int, # Title level (number of #)
        'text': str # Replaced text
    }]

    Raises:
        ValueError: If the idx values of the parsed lines are not the
            consecutive sequence 0, 1, 2, ...
    """
    res = []
    reg = re.compile(r'<(\d+)><h(\d+)>(.*?)</h\d+>')
    idx = 0
    for line in content.split('\n'):
        # Strip first so indented title lines are still recognized.
        match = reg.match(line.strip())
        if match is None:
            continue
        # check idx
        if int(match.group(1)) != idx:
            raise ValueError(f'idx error, {match.group(1)} != {idx}')
        idx += 1
        level = int(match.group(2))
        title = match.group(3)
        res.append({
            'title': title,
            'level': level,
            'text': '#' * level + ' ' + title,
        })
    return res

if __name__ == '__main__':
    # Pipeline: read input.md -> extract titles -> ask the LLM for corrected
    # levels -> parse the reply -> write the corrected document to output.md.
    # The commented-out dumps below save intermediate results for debugging.
    with open('input.md', 'r', encoding='utf-8') as f:
        content = f.read()
    origin_titles = get_titles_levels_from_markdown(content)
    # with open('origin_titles.txt', 'w', encoding='utf-8') as f2:
    #     c = json.dumps(origin_titles, ensure_ascii=False, indent=4)
    #     f2.write(c)
    messages = build_messages(origin_titles)
    # with open('messages.txt', 'w', encoding='utf-8') as f3:
    #     c = json.dumps(messages, ensure_ascii=False, indent=4)
    #     f3.write(c)
    # start/end must be assigned because the print below uses them.
    start = time.time()
    response = request_llm(messages)
    end = time.time()
    # with open('llm_response.txt', 'w', encoding='utf-8') as f4:
    #     f4.write(response)
    print(f'llm response time: {end - start}s')
    fixed_titles = read_output(response)
    # with open('fixed_titles.txt', 'w', encoding='utf-8') as f5:
    #     c = json.dumps(fixed_titles, ensure_ascii=False, indent=4)
    #     f5.write(c)
    merged_content = merge_and_fix(content, origin_titles, fixed_titles)
    # Persist the result; otherwise the corrected document is discarded.
    with open('output.md', 'w', encoding='utf-8') as f6:
        f6.write(merged_content)

Usage Instructions

Configure Parameters: Set LLM interface parameters such as api_key, model, url
Run Script: When executed, the script reads input.md, automatically processes the title hierarchies, and outputs the results

Important Notes

Recommend using models with strong reasoning capabilities like R1
The script maintains the original order of titles, only adjusting hierarchies
If there are obvious errors in title content, the LLM will make cautious corrections

Using LLM for Multi-level Title Hierarchy Enhancement Example Reference

Basic Concept

Model Recommendation

Reference Code

Usage Instructions

Important Notes

Using LLM for Multi-level Title Hierarchy Enhancement Example Reference

Basic Concept

Model Recommendation

Reference Code

Usage Instructions

Important Notes