Example Reference: Using an LLM to Enhance Multi-level Title Hierarchies
Basic Concept
Use LLM to re-evaluate the hierarchy of all secondary titles
Model Recommendation
We recommend using R1 or a similar reasoning model; other models may not achieve ideal results.
Reference Code
python
import re
import requests
import json
import time
# model recommendation: use R1 or similar reasoning models
# All three values are empty placeholders and must be filled in before running.
# api_key is sent by request_llm as the "Authorization: Bearer <api_key>" header.
api_key = ""
# model is sent by request_llm as the 'model' field of the JSON request body.
model = ""
# url is the endpoint request_llm POSTs to; its JSON reply is read as
# choices[0].message.content.
url = ""
def request_llm(messages: list[dict], timeout: float = 600.0) -> str:
    """
    Send chat messages to the configured LLM endpoint and return the reply text.

    The module-level ``api_key``, ``model`` and ``url`` values are used for the
    bearer token, the 'model' field of the request body and the POST target.

    Args:
        messages: Chat messages to send as the 'messages' field of the body.
        timeout: Seconds to wait for the server before aborting the request.
            The default is deliberately generous because reasoning models can
            take minutes to answer; without any timeout a stalled connection
            would hang the script forever.

    Returns:
        The ``choices[0].message.content`` string of the JSON response.

    Raises:
        ValueError: If the HTTP status code is not 200.
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }
    data = {
        'model': model,
        'messages': messages
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f'request llm failed, {response.status_code} {response.text}')
    # print(response.json())
    return response.json()['choices'][0]['message']['content']
def merge_and_fix(markdown_text: str, origin_titles: list[dict], fixed_titles: list[dict]) -> str:
    """
    Replace each original heading line in the markdown with its corrected form.

    Headings are matched as whole lines, in document order, one line per title.
    A global ``str.replace`` is not used because a heading such as '# Intro'
    is a substring of '## Intro' (and may appear in body text), so replacing
    it everywhere would corrupt other lines and already-fixed headings.

    Args:
        markdown_text: The full markdown document.
        origin_titles: Titles extracted from ``markdown_text`` in document
            order; each dict's 'text' is the complete original heading line.
        fixed_titles: Corrected titles, index-aligned with ``origin_titles``;
            each dict's 'text' is the replacement heading line.

    Returns:
        The markdown text with the heading lines replaced.

    Raises:
        ValueError: If the two title lists differ in length.
    """
    if len(origin_titles) != len(fixed_titles):
        raise ValueError(f'Length mismatch between origin_titles and output_titles, {len(origin_titles)} != {len(fixed_titles)}')
    lines = markdown_text.split('\n')
    total = len(origin_titles)
    pending = 0  # index of the next title still waiting to be located
    for line_no, line in enumerate(lines):
        if pending == total:
            break
        if line == origin_titles[pending]['text']:
            lines[line_no] = fixed_titles[pending]['text']
            pending += 1
    return '\n'.join(lines)
def get_titles_levels_from_markdown(markdown_text: str) -> list[dict]:
    """
    Extract heading lines (lines starting with '#') from markdown text.

    Only the leading run of '#' characters determines the level and is removed
    from the title, so a '#' inside the title itself (e.g. "# C# tips") is
    neither counted as a level nor stripped from the text.

    return: list[dict] = [{
        'title': str,  # Heading text without the leading #s, whitespace-stripped
        'level': int,  # Title level (number of leading #)
        'text': str    # The original, unmodified heading line
    }]
    """
    res = []
    for line in markdown_text.split('\n'):
        if not line.startswith('#'):
            continue
        # Length of the leading '#' run only; other '#' in the line are content.
        level = len(line) - len(line.lstrip('#'))
        res.append({
            'title': line[level:].strip(),
            'level': level,
            'text': line
        })
    return res
def build_messages(title_levels: list[dict]) -> list[dict]:
    """
    Assemble the system and user chat messages for the hierarchy-correction request.

    Each title is rendered on its own line as ``<index><hLEVEL>title</hLEVEL>``,
    where index is the title's zero-based position in ``title_levels``.

    Args:
        title_levels: Dicts providing at least the 'title' and 'level' keys.

    Returns:
        A two-element list: the system message followed by the user message.
    """
    system_prompt = (
        "You are an XML title hierarchy correction expert. Based on semantics, adjust title hierarchies to correct levels. If there are content errors in titles, modify them cautiously - only if you're certain, otherwise don't modify. Multiple first-level titles are allowed when appropriate. "
        "The <{{id}}> format in the input is for matching results, you must never modify it. "
        "Adding, deleting titles, or changing title order is not allowed."
    )
    # One tagged line per title; the position tag lets the reply be matched back.
    tagged_lines = [
        f"<{pos}><h{entry['level']}>{entry['title']}</h{entry['level']}>\n"
        for pos, entry in enumerate(title_levels)
    ]
    user_prompt = (
        'Input content:\n'
        + ''.join(tagged_lines)
        + '\nKeep output format consistent, output content:\n'
    )
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
def read_output(content: str) -> list[dict]:
    """
    Parse the LLM reply into corrected titles.

    Lines of the form ``<index><hLEVEL>title</hLEVEL>`` are parsed; any other
    line (blank lines, code fences, explanatory text) is skipped instead of
    crashing on a failed regex match. The indices of the parsed lines must
    count up from 0 without gaps, otherwise the reply dropped, added or
    reordered titles.

    return: list[dict] = [{
        'title': str,  # Title without #
        'level': int,  # Title level (number of #)
        'text': str    # Replacement heading line: '#' * level + ' ' + title
    }]

    Raises:
        ValueError: If a parsed line's index is not the next expected index.
    """
    reg = re.compile(r'<(\d+)><h(\d+)>(.*?)</h\d+>')
    res = []
    expected_idx = 0
    for line in content.split('\n'):
        # Strip first so indented lines still match at the start of the string.
        match = reg.match(line.strip())
        if match is None:
            continue
        # check idx
        if int(match.group(1)) != expected_idx:
            raise ValueError(f'idx error, {match.group(1)} != {expected_idx}')
        expected_idx += 1
        t = {
            'title': match.group(3),
            'level': int(match.group(2)),
        }
        t['text'] = '#' * t['level'] + ' ' + t['title']
        res.append(t)
    return res
if __name__ == '__main__':
    # Read the markdown document whose title levels should be corrected.
    with open('input.md', 'r', encoding='utf-8') as f:
        content = f.read()
    origin_titles = get_titles_levels_from_markdown(content)
    # Optional debug dump of the extracted titles:
    # with open('origin_titles.txt', 'w', encoding='utf-8') as f2:
    #     c = json.dumps(origin_titles, ensure_ascii=False, indent=4)
    #     f2.write(c)
    messages = build_messages(origin_titles)
    # Optional debug dump of the request messages:
    # with open('messages.txt', 'w', encoding='utf-8') as f3:
    #     c = json.dumps(messages, ensure_ascii=False, indent=4)
    #     f3.write(c)
    # Time the LLM call; start/end must be defined because the print below
    # uses them (they were commented out before, causing a NameError).
    start = time.time()
    response = request_llm(messages)
    end = time.time()
    # Optional debug dump of the raw LLM reply:
    # with open('llm_response.txt', 'w', encoding='utf-8') as f4:
    #     f4.write(response)
    print(f'llm response time: {end - start}s')
    fixed_titles = read_output(response)
    # Optional debug dump of the parsed titles:
    # with open('fixed_titles.txt', 'w', encoding='utf-8') as f5:
    #     c = json.dumps(fixed_titles, ensure_ascii=False, indent=4)
    #     f5.write(c)
    merged_content = merge_and_fix(content, origin_titles, fixed_titles)
    # Persist the corrected document; otherwise the merged result is discarded.
    with open('output.md', 'w', encoding='utf-8') as f6:
        f6.write(merged_content)
Usage Instructions
- Configure Parameters: Set the LLM interface parameters `api_key`, `model`, and `url`
- Run Script: After executing the script, it will automatically process title hierarchies and output results
Important Notes
- We recommend using models with strong reasoning capabilities, such as R1
- The script maintains the original order of titles, only adjusting hierarchies
- If there are obvious errors in title content, the LLM will make cautious corrections