Still broken. Trying to make the update procedure make sense.
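Rough intent of the rework, as a hedged sketch (reimport_document, the markdown_file argument, and document_id are placeholders for illustration; only MarkdownProcessor and process_markdown come from this file):

    def reimport_document(db_manager, markdown_file: str, document_id: int) -> None:
        # db_manager is assumed to be the script's DatabaseManager (exposes .conn / .cursor).
        processor = MarkdownProcessor(db_manager)
        # Parses the file, then merges the parsed heading/body tree into the stored one:
        # matching headings are updated in place, new ones inserted, and headings missing
        # from the file soft-deleted, instead of deleting the document's rows and re-inserting.
        processor.process_markdown(markdown_file, document_id)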
parent 865b0d225d
commit 3fe7f52066
@@ -13,7 +13,7 @@ import hashlib
 import argparse
 import logging
 from datetime import datetime
-from typing import List, Tuple, Optional
+from typing import List, Tuple, Dict, Set, Optional
 from markdown_it import MarkdownIt
 from thefuzz import fuzz, process
 
@@ -137,108 +137,188 @@ class DocumentManager:
 
 class MarkdownProcessor:
     """Processes markdown files and stores content in the database."""
 
     def __init__(self, db_manager: 'DatabaseManager') -> None:
-        """Initialize the MarkdownProcessor."""
         self.db_manager = db_manager
 
     def process_markdown(self, markdown_file: str, document_id: int) -> None:
-        """Process a markdown file and store its content in the database."""
         markdown_text = self.read_markdown_file(markdown_file)
         md = MarkdownIt()
         tokens = md.parse(markdown_text)
 
-        self.clear_document_content(document_id)
-        self.store_markdown_content(tokens, document_id)
+        self.update_document_content(tokens, document_id)
 
     def read_markdown_file(self, file_path: str) -> str:
-        """Read content from a markdown file."""
         with open(file_path, 'r', encoding='utf-8') as file:
             return file.read()
 
-    def clear_document_content(self, document_id: int) -> None:
-        """Clear existing content for a document in the database."""
-        logging.debug(f"!! DELETING FROM DATABASE, document_id: {document_id}")
-        self.db_manager.cursor.execute('DELETE FROM headings WHERE document_id = ?', (document_id,))
-        self.db_manager.cursor.execute('DELETE FROM body WHERE document_id = ?', (document_id,))
-
-    def store_markdown_content(self, tokens: List, document_id: int) -> None:
-        """Store parsed markdown content in the database."""
-        parent_stack: List[Tuple[int, int]] = []  # (level, heading_id)
-        current_heading_id = None
-        for token in tokens:
-            content_preview = ' '.join(token.content.split()[:10]) + '...' \
-                if len(token.content.split()) > 10 else token.content
-
-            #logging.debug(f"Processing token: {token.type}, content: {content_preview}")
-            if token.type == 'heading_open':
-                level = int(token.tag.strip('h'))
-                content_token = tokens[tokens.index(token) + 1]
-                title = content_token.content
-
-                # Find the appropriate parent
-                while parent_stack and parent_stack[-1][0] >= level:
-                    parent_stack.pop()
-
-                parent_id = parent_stack[-1][1] if parent_stack else None
-                current_heading_id = self.insert_heading(level, title, parent_id, document_id)
-
-                parent_stack.append((level, current_heading_id))
-            elif token.type == 'inline' and current_heading_id and token.content.strip():
-                # Only insert non-empty content that's not part of a heading
-                if tokens[tokens.index(token) - 1].type != 'heading_open':
-                    self.insert_body(token.content, current_heading_id, document_id)
-
-        self.db_manager.conn.commit()
+    def update_document_content(self, tokens: List, document_id: int) -> None:
+        existing_structure = self.get_existing_document_structure(document_id)
+        new_structure = self.parse_new_structure(tokens)
+        self.merge_structures(existing_structure, new_structure, document_id)
+
+    def get_existing_document_structure(self, document_id: int) -> Dict:
+        structure = {}
+        self.db_manager.cursor.execute('''
+            SELECT h.id, h.level, h.title, h.parent_id, b.content
+            FROM headings h
+            LEFT JOIN body b ON h.id = b.heading_id
+            WHERE h.document_id = ? AND h.isDeleted = 0
+            ORDER BY h.level, h.id
+        ''', (document_id,))
+        for heading_id, level, title, parent_id, content in self.db_manager.cursor.fetchall():
+            structure[heading_id] = {
+                'level': level,
+                'title': title,
+                'parent_id': parent_id,
+                'content': content,
+                'children': []
+            }
+        # Build the tree structure
+        root = {}
+        for id, node in structure.items():
+            if node['parent_id'] in structure:
+                structure[node['parent_id']]['children'].append(id)
+            else:
+                root[id] = node
+        return root
+
+    def parse_new_structure(self, tokens: List) -> Dict:
+        structure = {}
+        current_heading = None
+        current_content = []
+        parent_stack = [{"id": None, "level": 0}]
+
+        for token in tokens:
+            if token.type == 'heading_open':
+                if current_heading:
+                    structure[current_heading]['content'] = ''.join(current_content).strip()
+                level = int(token.tag.strip('h'))
+                while parent_stack[-1]['level'] >= level:
+                    parent_stack.pop()
+                current_heading = str(uuid.uuid4())  # Generate a temporary ID
+                structure[current_heading] = {
+                    'level': level,
+                    'title': '',
+                    'parent_id': parent_stack[-1]['id'],
+                    'content': '',
+                    'children': []
+                }
+                parent_stack.append({"id": current_heading, "level": level})
+                current_content = []
+            elif token.type == 'heading_close':
+                structure[current_heading]['content'] = ''.join(current_content).strip()
+            elif token.type == 'inline' and current_heading:
+                if structure[current_heading]['title'] == '':
+                    structure[current_heading]['title'] = token.content
+                else:
+                    current_content.append(token.content)
+            elif current_heading:
+                current_content.append(token.content)
+
+        if current_heading:
+            structure[current_heading]['content'] = ''.join(current_content).strip()
+
+        return structure
+
+    def merge_structures(self, existing: Dict, new: Dict, document_id: int) -> None:
+        def merge_recursive(existing_node, new_node, parent_id):
+            if not existing_node:
+                # This is a new node, insert it
+                heading_id = self.insert_heading(new_node['level'], new_node['title'], parent_id, document_id)
+                self.insert_body(new_node['content'], heading_id, document_id)
+                for child in new_node['children']:
+                    merge_recursive(None, new[child], heading_id)
+            else:
+                # Update existing node
+                self.update_heading(existing_node['id'], new_node['title'], new_node['level'], parent_id)
+                self.update_body(existing_node['id'], new_node['content'], document_id)
+
+                # Process children
+                existing_children = {child['title']: child for child in existing_node['children']}
+                new_children = {child['title']: child for child in new_node['children']}
+
+                for title, child in new_children.items():
+                    if title in existing_children:
+                        merge_recursive(existing_children[title], child, existing_node['id'])
+                    else:
+                        merge_recursive(None, child, existing_node['id'])
+
+                for title, child in existing_children.items():
+                    if title not in new_children:
+                        self.soft_delete_heading(child['id'])
+
+        for new_root in new.values():
+            existing_root = next((node for node in existing.values() if node['title'] == new_root['title']), None)
+            merge_recursive(existing_root, new_root, None)
 
     def insert_heading(self, level: int, title: str, parent_id: Optional[int], document_id: int) -> int:
-        """Insert a heading into the database."""
-        logging.debug(f"Inserting title: {title} level: {level}")
         self.db_manager.cursor.execute('''
             INSERT INTO headings (level, title, parent_id, document_id)
             VALUES (?, ?, ?, ?)
         ''', (level, title, parent_id, document_id))
         return self.db_manager.cursor.lastrowid
 
+    def update_heading(self, heading_id: int, title: str, level: int, parent_id: Optional[int]) -> None:
+        self.db_manager.cursor.execute('''
+            UPDATE headings
+            SET title = ?, level = ?, parent_id = ?, updated_timestamp = CURRENT_TIMESTAMP
+            WHERE id = ?
+        ''', (title, level, parent_id, heading_id))
+
     def insert_body(self, content: str, heading_id: int, document_id: int) -> None:
-        """Insert body content into the database with checksumming."""
         md5sum = hashlib.md5(content.encode()).hexdigest()
         self.db_manager.cursor.execute('''
             INSERT INTO body (content, heading_id, document_id, md5sum)
             VALUES (?, ?, ?, ?)
         ''', (content, heading_id, document_id, md5sum))
 
+    def update_body(self, heading_id: int, content: str, document_id: int) -> None:
+        md5sum = hashlib.md5(content.encode()).hexdigest()
+        self.db_manager.cursor.execute('''
+            UPDATE body
+            SET content = ?, md5sum = ?, updated_timestamp = CURRENT_TIMESTAMP
+            WHERE heading_id = ? AND document_id = ?
+        ''', (content, md5sum, heading_id, document_id))
+
+    def soft_delete_heading(self, heading_id: int) -> None:
+        now = datetime.now().isoformat()
+        self.db_manager.cursor.execute('''
+            UPDATE headings
+            SET isDeleted = 1, deleted_timestamp = ?
+            WHERE id = ?
+        ''', (now, heading_id))
+        # Also soft delete associated body content
+        self.db_manager.cursor.execute('''
+            UPDATE body
+            SET isDeleted = 1, deleted_timestamp = ?
+            WHERE heading_id = ?
+        ''', (now, heading_id))
+
 
 class TopicReader:
     """Reads and retrieves topics from the database."""
 
     def __init__(self, db_manager: 'DatabaseManager'):
-        """
-        Initialize the TopicReader.
-
-        Args:
-            db_manager (DatabaseManager): An instance of DatabaseManager.
-        """
         self.db_manager = db_manager
 
-    def fetch_headings(self) -> List[Tuple[int, str, int]]:
-        """
-        Fetch all non-deleted headings from the database.
-        """
-        self.db_manager.cursor.execute('SELECT id, title, level FROM headings WHERE isDeleted = 0 ORDER BY level, id')
+    def fetch_headings(self) -> List[Tuple[int, str, int, Optional[int]]]:
+        self.db_manager.cursor.execute('''
+            SELECT id, title, level, parent_id
+            FROM headings
+            WHERE isDeleted = 0
+            ORDER BY level, id
+        ''')
         return self.db_manager.cursor.fetchall()
 
     def fetch_topic_chain(self, heading_id: int) -> List[Tuple[int, str, int]]:
-        """
-        Fetch the topic chain (hierarchy of parent topics) for a given heading.
-
-        Returns:
-            List[Tuple[int, str, int]]: List of (id, title, level) tuples representing the topic chain.
-        """
         chain = []
         current_id = heading_id
 
         while current_id is not None:
-            self.db_manager.cursor.execute('SELECT id, title, level, parent_id FROM headings WHERE id = ?', (current_id,))
+            self.db_manager.cursor.execute('''
+                SELECT id, title, level, parent_id
+                FROM headings
+                WHERE id = ?
+            ''', (current_id,))
             result = self.db_manager.cursor.fetchone()
             if result:
                 chain.append((result[0], result[1], result[2]))
@@ -247,119 +327,74 @@ class TopicReader:
                 break
 
         return list(reversed(chain))
 
     def list_headings(self) -> str:
-        """
-        List all available headings in a hierarchical structure.
-
-        Returns:
-            str: A formatted string containing all headings.
-        """
         headings = self.fetch_headings()
         result = "Available headings:\n"
 
-        for _, title, level in headings:
-            indent = " " * (level - 1)
-            result += f"{indent}- {title}\n"
+        def build_tree(parent_id, level):
+            tree = ""
+            for id, title, hlevel, parent in headings:
+                if parent == parent_id:
+                    indent = " " * (hlevel - 1)
+                    tree += f"{indent}- {title}\n"
+                    tree += build_tree(id, hlevel + 1)
+            return tree
 
+        result += build_tree(None, 1)
         return result.strip()
 
-    def fetch_body_and_subtopics(self, heading_id: int, include_subtopics: bool = True, level_offset: int = 0) -> str:
-        """
-        Fetch body content and subtopics for a given heading with improved Markdown formatting.
-
-        Args:
-            heading_id (int): ID of the heading to fetch.
-            include_subtopics (bool): Whether to include subtopics in the result.
-            level_offset (int): Offset to adjust heading levels for proper nesting.
-
-        Returns:
-            str: Formatted string containing the heading content and subtopics.
-        """
-        # Fetch the current heading and body content
-        self.db_manager.cursor.execute('SELECT level, title FROM headings WHERE id = ?', (heading_id,))
-        level, title = self.db_manager.cursor.fetchone()
-
-        # Adjust the level based on the offset
-        adjusted_level = max(1, level - level_offset)
-
-        # Fetch the content for this heading
-        self.db_manager.cursor.execute('SELECT content FROM body WHERE heading_id = ?', (heading_id,))
-        rows = self.db_manager.cursor.fetchall()
-        body_content = '\n'.join([row[0] for row in rows])
-
-        # Construct the result with proper spacing
-        result = f"\n{'#' * adjusted_level} {title}\n\n"
-        if body_content.strip():
-            result += f"{body_content.strip()}\n\n"
-
-        if include_subtopics:
-            # Fetch all subtopics that are children of the current heading
-            subtopics = self._fetch_subtopics(heading_id, adjusted_level)
-            for subtopic_id, _, _ in subtopics:
-                # Recursively fetch subtopic content
-                subtopic_content = self.fetch_body_and_subtopics(subtopic_id, include_subtopics=True, level_offset=level_offset)
-                result += subtopic_content
-
-        return result.strip() + "\n"  # Ensure there's a newline at the end of each section
-
     def get_topic_content(self, input_title: str) -> Optional[str]:
-        """
-        Get the content of a topic based on the input title, including its topic chain and subtopics.
-
-        Returns:
-            str or None: Formatted string containing the topic chain, content, and subtopics, or None if not found.
-        """
         heading_id = self.find_closest_heading(input_title)
         if heading_id:
             topic_chain = self.fetch_topic_chain(heading_id)
-            result = ""
-            for i, (id, title, level) in enumerate(topic_chain):
-                if id == heading_id:
-                    # Fetch the full content for the selected topic and its subtopics
-                    result += self.fetch_body_and_subtopics(id, include_subtopics=True, level_offset=i)
-                else:
-                    # Include only the heading chain without duplicating content
-                    result += f"\n{'#' * (level - i)} {title}\n\n"
-            return result.strip() + "\n"  # Ensure there's a final newline
-        print(f"No topic found matching '{input_title}'.")
+            result = self.build_full_content(topic_chain[-1][0])
+            return result
         return None
 
-    def _fetch_subtopics(self, heading_id: int, parent_level: int) -> List[Tuple[int, int, str]]:
-        """
-        Fetch all subtopics that are children of the given heading.
-
-        Returns:
-            List of tuples containing the subtopic's ID, level, and title.
-        """
+    def build_full_content(self, heading_id: int, level_offset: int = 0) -> str:
         self.db_manager.cursor.execute('''
-            SELECT id, level, title
-            FROM headings
+            SELECT h.level, h.title, b.content
+            FROM headings h
+            LEFT JOIN body b ON h.id = b.heading_id
+            WHERE h.id = ? AND h.isDeleted = 0
+        ''', (heading_id,))
+        heading = self.db_manager.cursor.fetchone()
+        if not heading:
+            return ""
+
+        level, title, content = heading
+        adjusted_level = max(1, level - level_offset)
+        result = f"{'#' * adjusted_level} {title}\n\n"
+        if content:
+            result += f"{content.strip()}\n\n"
+
+        # Fetch and process all child headings
+        self.db_manager.cursor.execute('''
+            SELECT id FROM headings
             WHERE parent_id = ? AND isDeleted = 0
             ORDER BY level, id
         ''', (heading_id,))
-        return self.db_manager.cursor.fetchall()
+        children = self.db_manager.cursor.fetchall()
+        for child in children:
+            result += self.build_full_content(child[0], level_offset)
+
+        return result
 
     def find_closest_heading(self, input_title: str) -> Optional[int]:
-        """
-        Find the closest matching heading to the input title using fuzzy matching.
-
-        Returns:
-            int or None: ID of the closest matching heading, or None if no match found.
-        """
         headings = self.fetch_headings()
         if not headings:
             print("No topics found in the database.")
             return None
 
-        heading_titles = [title for _, title, _ in headings]
+        heading_titles = [title for _, title, _, _ in headings]
         closest_match, confidence = process.extractOne(input_title, heading_titles, scorer=fuzz.token_sort_ratio)
 
         if confidence < 50:
             print(f"No close matches found for '{input_title}' (Confidence: {confidence})")
             return None
 
-        for heading_id, title, level in headings:
+        for heading_id, title, _, _ in headings:
             if title == closest_match:
                 return heading_id
 