Still broken. Trying to make the update procedure make sense.

This commit is contained in:
kalzu rekku 2024-10-04 14:15:35 +03:00
parent 865b0d225d
commit 3fe7f52066

View File

@ -13,7 +13,7 @@ import hashlib
import argparse import argparse
import logging import logging
from datetime import datetime from datetime import datetime
from typing import List, Tuple, Optional from typing import List, Tuple, Dict, Set, Optional
from markdown_it import MarkdownIt from markdown_it import MarkdownIt
from thefuzz import fuzz, process from thefuzz import fuzz, process
@ -137,108 +137,188 @@ class DocumentManager:
class MarkdownProcessor:
    """Processes markdown files and stores content in the database.

    A document is stored as a tree of headings (``headings`` table) with the
    text that follows each heading kept in the ``body`` table.  Re-processing
    a document merges the freshly parsed tree into the stored one: headings
    are matched by title among siblings, matched rows are updated in place,
    unmatched new headings are inserted and headings that disappeared are
    soft-deleted together with their descendants.

    Tree nodes are plain dicts with the keys ``level``, ``title``,
    ``parent_id``, ``content`` and ``children`` (a list of child node dicts).
    Nodes loaded from the database additionally carry ``id``; nodes parsed
    from markdown carry ``temp_id`` instead.
    """

    # Separator placed between consecutive content blocks under one heading.
    BLOCK_SEPARATOR = "\n\n"

    def __init__(self, db_manager: 'DatabaseManager') -> None:
        """Initialize the MarkdownProcessor.

        Args:
            db_manager: Object exposing the ``cursor`` and ``conn``
                attributes that every query in this class goes through.
        """
        self.db_manager = db_manager

    def process_markdown(self, markdown_file: str, document_id: int) -> None:
        """Process a markdown file and store its content in the database.

        Args:
            markdown_file: Path of the markdown file to read.
            document_id: ID of the document the headings belong to.
        """
        markdown_text = self.read_markdown_file(markdown_file)
        tokens = MarkdownIt().parse(markdown_text)
        self.update_document_content(tokens, document_id)

    def read_markdown_file(self, file_path: str) -> str:
        """Read content from a markdown file (decoded as UTF-8)."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def update_document_content(self, tokens: List, document_id: int) -> None:
        """Merge freshly parsed markdown tokens into the stored document.

        The merge is committed as a whole; if any step fails the pending
        changes are rolled back and the exception is re-raised, so a
        half-merged document is never persisted.

        Args:
            tokens: Token list produced by ``MarkdownIt().parse``.
            document_id: ID of the document being updated.
        """
        existing_structure = self.get_existing_document_structure(document_id)
        new_structure = self.parse_new_structure(tokens)
        try:
            self.merge_structures(existing_structure, new_structure, document_id)
        except Exception:
            self.db_manager.conn.rollback()
            raise
        # The previous store_markdown_content() committed after writing;
        # keep that contract so callers do not lose the update.
        self.db_manager.conn.commit()

    def get_existing_document_structure(self, document_id: int) -> Dict:
        """Load the non-deleted heading tree of a document.

        Args:
            document_id: ID of the document to load.

        Returns:
            Dict mapping heading id -> root node.  A heading counts as a root
            when its parent is NULL or is not a live heading of this
            document.  Children are nested node dicts in heading-id order.
        """
        cursor = self.db_manager.cursor
        # rowid is SQLite's implicit row identifier; it is used only to give
        # multiple body rows of one heading a stable order.
        cursor.execute('''
            SELECT h.id, h.level, h.title, h.parent_id, b.content
            FROM headings h
            LEFT JOIN body b ON h.id = b.heading_id AND b.isDeleted = 0
            WHERE h.document_id = ? AND h.isDeleted = 0
            ORDER BY h.id, b.rowid
        ''', (document_id,))

        nodes: Dict = {}
        for heading_id, level, title, parent_id, content in cursor.fetchall():
            node = nodes.get(heading_id)
            if node is None:
                node = nodes[heading_id] = {
                    'id': heading_id,
                    'level': level,
                    'title': title,
                    'parent_id': parent_id,
                    'content': None,
                    'children': [],
                }
            if content is not None:
                # A heading may own several body rows (one row per paragraph
                # was stored by the earlier import code); fold them together.
                if node['content'] is None:
                    node['content'] = content
                else:
                    node['content'] += self.BLOCK_SEPARATOR + content

        roots: Dict = {}
        for heading_id, node in nodes.items():
            parent = nodes.get(node['parent_id'])
            if parent is not None and parent is not node:
                parent['children'].append(node)
            else:
                roots[heading_id] = node
        return roots

    def parse_new_structure(self, tokens: List) -> Dict:
        """Build a heading tree from markdown-it tokens.

        Only the text between ``heading_open`` and ``heading_close`` becomes
        the title, so an empty heading no longer swallows the following
        paragraph.  Every other token with non-blank ``content`` that follows
        a heading is appended to that heading's content; blocks are separated
        by a blank line.  Content that appears before the first heading is
        ignored.

        Args:
            tokens: Token list produced by ``MarkdownIt().parse``; each token
                must expose ``type``, ``tag`` and ``content``.

        Returns:
            Dict mapping a temporary id (``"new-<n>"``) -> root node.
        """
        roots: Dict = {}
        open_headings: List = []  # ancestors of the current position, innermost last
        current = None
        pieces: List[str] = []
        in_heading = False
        counter = 0

        for token in tokens:
            if token.type == 'heading_open':
                if current is not None:
                    current['content'] = self.BLOCK_SEPARATOR.join(pieces)
                level = int(token.tag.lstrip('h'))
                # Close every open heading that is not an ancestor of this one.
                while open_headings and open_headings[-1]['level'] >= level:
                    open_headings.pop()
                parent = open_headings[-1] if open_headings else None
                counter += 1
                current = {
                    'temp_id': f"new-{counter}",
                    'level': level,
                    'title': '',
                    'parent_id': parent['temp_id'] if parent else None,
                    'content': '',
                    'children': [],
                }
                if parent is None:
                    roots[current['temp_id']] = current
                else:
                    parent['children'].append(current)
                open_headings.append(current)
                pieces = []
                in_heading = True
            elif token.type == 'heading_close':
                in_heading = False
            elif current is not None:
                if in_heading:
                    if token.type == 'inline':
                        current['title'] = token.content
                else:
                    text = token.content.strip()
                    if text:
                        pieces.append(text)

        if current is not None:
            current['content'] = self.BLOCK_SEPARATOR.join(pieces)
        return roots

    def merge_structures(self, existing: Dict, new: Dict, document_id: int) -> None:
        """Apply the difference between the stored and the parsed tree.

        Args:
            existing: Root nodes from ``get_existing_document_structure``.
            new: Root nodes from ``parse_new_structure``.
            document_id: ID of the document being updated.
        """
        self._merge_siblings(list(existing.values()), list(new.values()), None, document_id)

    def _merge_siblings(self, existing_nodes: List, new_nodes: List,
                        parent_id: Optional[int], document_id: int) -> None:
        """Merge one level of the tree, then recurse into matched children."""
        # Group by title but keep every node: duplicate titles are matched
        # in document order instead of overwriting each other.
        unmatched: Dict = {}
        for node in existing_nodes:
            unmatched.setdefault(node['title'], []).append(node)

        for new_node in new_nodes:
            candidates = unmatched.get(new_node['title'])
            if candidates:
                existing_node = candidates.pop(0)
                heading_id = existing_node['id']
                # Touch rows only when something changed so that
                # updated_timestamp reflects real edits.
                if (existing_node['level'], existing_node['parent_id']) != (new_node['level'], parent_id):
                    self.update_heading(heading_id, new_node['title'], new_node['level'], parent_id)
                if (existing_node['content'] or '').strip() != new_node['content']:
                    self.update_body(heading_id, new_node['content'], document_id)
                existing_children = existing_node['children']
            else:
                heading_id = self.insert_heading(
                    new_node['level'], new_node['title'], parent_id, document_id)
                self.insert_body(new_node['content'], heading_id, document_id)
                existing_children = []
            self._merge_siblings(existing_children, new_node['children'], heading_id, document_id)

        # Whatever was not matched no longer exists in the document.
        for leftovers in unmatched.values():
            for node in leftovers:
                self._soft_delete_subtree(node)

    def _soft_delete_subtree(self, node: Dict) -> None:
        """Soft-delete a stored heading and all of its live descendants."""
        self.soft_delete_heading(node['id'])
        for child in node['children']:
            self._soft_delete_subtree(child)

    def insert_heading(self, level: int, title: str, parent_id: Optional[int], document_id: int) -> int:
        """Insert a heading into the database.

        Returns:
            The row id of the inserted heading.
        """
        self.db_manager.cursor.execute('''
            INSERT INTO headings (level, title, parent_id, document_id)
            VALUES (?, ?, ?, ?)
        ''', (level, title, parent_id, document_id))
        return self.db_manager.cursor.lastrowid

    def update_heading(self, heading_id: int, title: str, level: int, parent_id: Optional[int]) -> None:
        """Overwrite title, level and parent of a heading and stamp the update time."""
        self.db_manager.cursor.execute('''
            UPDATE headings
            SET title = ?, level = ?, parent_id = ?, updated_timestamp = CURRENT_TIMESTAMP
            WHERE id = ?
        ''', (title, level, parent_id, heading_id))

    def insert_body(self, content: str, heading_id: int, document_id: int) -> None:
        """Insert body content into the database with checksumming."""
        md5sum = hashlib.md5(content.encode()).hexdigest()
        self.db_manager.cursor.execute('''
            INSERT INTO body (content, heading_id, document_id, md5sum)
            VALUES (?, ?, ?, ?)
        ''', (content, heading_id, document_id, md5sum))

    def update_body(self, heading_id: int, content: str, document_id: int) -> None:
        """Replace the body text of a heading.

        The first live body row is updated in place.  If the heading has no
        live body row one is inserted, and any additional live rows are
        soft-deleted so the heading ends up with exactly one body row.
        """
        cursor = self.db_manager.cursor
        cursor.execute('''
            SELECT rowid FROM body
            WHERE heading_id = ? AND document_id = ? AND isDeleted = 0
            ORDER BY rowid
        ''', (heading_id, document_id))
        row_ids = [row[0] for row in cursor.fetchall()]
        if not row_ids:
            self.insert_body(content, heading_id, document_id)
            return

        md5sum = hashlib.md5(content.encode()).hexdigest()
        cursor.execute('''
            UPDATE body
            SET content = ?, md5sum = ?, updated_timestamp = CURRENT_TIMESTAMP
            WHERE rowid = ?
        ''', (content, md5sum, row_ids[0]))

        now = datetime.now().isoformat()
        for extra_id in row_ids[1:]:
            cursor.execute('''
                UPDATE body
                SET isDeleted = 1, deleted_timestamp = ?
                WHERE rowid = ?
            ''', (now, extra_id))

    def soft_delete_heading(self, heading_id: int) -> None:
        """Mark a heading and its live body rows as deleted.

        Only this heading is touched; descendants are handled by the caller.
        """
        now = datetime.now().isoformat()
        self.db_manager.cursor.execute('''
            UPDATE headings
            SET isDeleted = 1, deleted_timestamp = ?
            WHERE id = ?
        ''', (now, heading_id))
        # Also soft delete associated body content; rows deleted earlier keep
        # their original deletion timestamp.
        self.db_manager.cursor.execute('''
            UPDATE body
            SET isDeleted = 1, deleted_timestamp = ?
            WHERE heading_id = ? AND isDeleted = 0
        ''', (now, heading_id))
class TopicReader: class TopicReader:
"""Reads and retrieves topics from the database.""" """Reads and retrieves topics from the database."""
def __init__(self, db_manager: 'DatabaseManager'): def __init__(self, db_manager: 'DatabaseManager'):
"""
Initialize the TopicReader.
Args:
db_manager (DatabaseManager): An instance of DatabaseManager.
"""
self.db_manager = db_manager self.db_manager = db_manager
def fetch_headings(self) -> List[Tuple[int, str, int, Optional[int]]]:
    """Fetch all non-deleted headings from the database.

    Returns:
        List of ``(id, title, level, parent_id)`` rows ordered by level,
        then by id.
    """
    cursor = self.db_manager.cursor
    cursor.execute('''
        SELECT id, title, level, parent_id
        FROM headings
        WHERE isDeleted = 0
        ORDER BY level, id
    ''')
    rows = cursor.fetchall()
    return rows
def fetch_topic_chain(self, heading_id: int) -> List[Tuple[int, str, int]]: def fetch_topic_chain(self, heading_id: int) -> List[Tuple[int, str, int]]:
"""
Fetch the topic chain (hierarchy of parent topics) for a given heading.
Returns:
List[Tuple[int, str, int]]: List of (id, title, level) tuples representing the topic chain.
"""
chain = [] chain = []
current_id = heading_id current_id = heading_id
while current_id is not None: while current_id is not None:
self.db_manager.cursor.execute('SELECT id, title, level, parent_id FROM headings WHERE id = ?', (current_id,)) self.db_manager.cursor.execute('''
SELECT id, title, level, parent_id
FROM headings
WHERE id = ?
''', (current_id,))
result = self.db_manager.cursor.fetchone() result = self.db_manager.cursor.fetchone()
if result: if result:
chain.append((result[0], result[1], result[2])) chain.append((result[0], result[1], result[2]))
@ -247,119 +327,74 @@ class TopicReader:
break break
return list(reversed(chain)) return list(reversed(chain))
def list_headings(self) -> str:
    """List all available headings in a hierarchical structure.

    Headings are grouped by parent once, then rendered depth-first starting
    from the headings that have no parent.  Each line is indented by one
    space per level below the first.  Headings whose parent is not among the
    fetched (non-deleted) headings are not reachable and are not listed.

    Returns:
        str: ``"Available headings:"`` followed by one ``- title`` line per
        heading, without a trailing newline.
    """
    children_by_parent: Dict[Optional[int], List[Tuple[int, str, int]]] = {}
    for heading_id, title, level, parent_id in self.fetch_headings():
        children_by_parent.setdefault(parent_id, []).append((heading_id, title, level))

    def render(parent_id: Optional[int]) -> List[str]:
        lines: List[str] = []
        for heading_id, title, level in children_by_parent.get(parent_id, ()):
            indent = " " * (level - 1)
            lines.append(f"{indent}- {title}\n")
            lines.extend(render(heading_id))
        return lines

    return ("Available headings:\n" + "".join(render(None))).strip()
def fetch_body_and_subtopics(self, heading_id: int, include_subtopics: bool = True, level_offset: int = 0) -> str:
"""
Fetch body content and subtopics for a given heading with improved Markdown formatting.
Args:
heading_id (int): ID of the heading to fetch.
include_subtopics (bool): Whether to include subtopics in the result.
level_offset (int): Offset to adjust heading levels for proper nesting.
Returns:
str: Formatted string containing the heading content and subtopics.
"""
# Fetch the current heading and body content
self.db_manager.cursor.execute('SELECT level, title FROM headings WHERE id = ?', (heading_id,))
level, title = self.db_manager.cursor.fetchone()
# Adjust the level based on the offset
adjusted_level = max(1, level - level_offset)
# Fetch the content for this heading
self.db_manager.cursor.execute('SELECT content FROM body WHERE heading_id = ?', (heading_id,))
rows = self.db_manager.cursor.fetchall()
body_content = '\n'.join([row[0] for row in rows])
# Construct the result with proper spacing
result = f"\n{'#' * adjusted_level} {title}\n\n"
if body_content.strip():
result += f"{body_content.strip()}\n\n"
if include_subtopics:
# Fetch all subtopics that are children of the current heading
subtopics = self._fetch_subtopics(heading_id, adjusted_level)
for subtopic_id, _, _ in subtopics:
# Recursively fetch subtopic content
subtopic_content = self.fetch_body_and_subtopics(subtopic_id, include_subtopics=True, level_offset=level_offset)
result += subtopic_content
return result.strip() + "\n" # Ensure there's a newline at the end of each section
def get_topic_content(self, input_title: str) -> Optional[str]:
    """Get the content of the topic that best matches the input title.

    The matched heading is rendered together with all of its subtopics.
    The heading id returned by ``find_closest_heading`` is used directly:
    the last element of the topic chain is that same heading, so walking
    the chain only cost extra queries and raised ``IndexError`` when the
    chain came back empty.

    Args:
        input_title: Title (or approximate title) to look up.

    Returns:
        str or None: Markdown for the topic and its subtopics, or None when
        no heading matches.
    """
    heading_id = self.find_closest_heading(input_title)
    if heading_id is None:
        return None
    return self.build_full_content(heading_id)
def build_full_content(self, heading_id: int, level_offset: int = 0) -> str:
    """Render a heading, its body text and all of its subtopics as markdown.

    All non-deleted body rows of the heading are included (in row order,
    separated by a blank line); a heading may own more than one body row.
    Child headings are rendered recursively with the same ``level_offset``.

    Args:
        heading_id: ID of the heading to render.
        level_offset: Number of levels to subtract from each stored heading
            level; the resulting level never drops below 1.

    Returns:
        str: The rendered markdown, or an empty string when the heading does
        not exist or is soft-deleted.
    """
    cursor = self.db_manager.cursor
    # rowid is SQLite's implicit row identifier; it only fixes the order in
    # which multiple body rows are concatenated.
    cursor.execute('''
        SELECT h.level, h.title, b.content
        FROM headings h
        LEFT JOIN body b ON h.id = b.heading_id AND b.isDeleted = 0
        WHERE h.id = ? AND h.isDeleted = 0
        ORDER BY b.rowid
    ''', (heading_id,))
    rows = cursor.fetchall()
    if not rows:
        return ""

    level, title = rows[0][0], rows[0][1]
    adjusted_level = max(1, level - level_offset)
    parts = [f"{'#' * adjusted_level} {title}\n\n"]

    body_text = "\n\n".join(
        content.strip() for _, _, content in rows if content and content.strip()
    )
    if body_text:
        parts.append(f"{body_text}\n\n")

    # Read every child id before recursing: the recursion reuses this cursor.
    cursor.execute('''
        SELECT id FROM headings
        WHERE parent_id = ? AND isDeleted = 0
        ORDER BY level, id
    ''', (heading_id,))
    child_ids = [row[0] for row in cursor.fetchall()]
    for child_id in child_ids:
        parts.append(self.build_full_content(child_id, level_offset))
    return "".join(parts)
def find_closest_heading(self, input_title: str) -> Optional[int]:
    """Find the heading whose title is the closest fuzzy match to the input.

    Titles of all fetched headings are scored with ``fuzz.token_sort_ratio``;
    a best score below 50 counts as no match.

    Returns:
        int or None: ID of the first heading carrying the best-matching
        title, or None when there are no headings or no close match.
    """
    rows = self.fetch_headings()
    if not rows:
        print("No topics found in the database.")
        return None

    candidates = [row[1] for row in rows]
    best_title, score = process.extractOne(input_title, candidates, scorer=fuzz.token_sort_ratio)
    if score < 50:
        print(f"No close matches found for '{input_title}' (Confidence: {score})")
        return None

    return next((row[0] for row in rows if row[1] == best_title), None)