From 3fe7f52066722bddf01bf0cb0f71600747cdf09a Mon Sep 17 00:00:00 2001 From: kalzu rekku Date: Fri, 4 Oct 2024 14:15:35 +0300 Subject: [PATCH] Still broken. Trying to make the update procedure to make sense. --- markdown_sqlite.py | 327 +++++++++++++++++++++++++-------------------- 1 file changed, 181 insertions(+), 146 deletions(-) diff --git a/markdown_sqlite.py b/markdown_sqlite.py index b43ddf7..0ae8861 100644 --- a/markdown_sqlite.py +++ b/markdown_sqlite.py @@ -13,7 +13,7 @@ import hashlib import argparse import logging from datetime import datetime -from typing import List, Tuple, Optional +from typing import List, Tuple, Dict, Set, Optional from markdown_it import MarkdownIt from thefuzz import fuzz, process @@ -137,108 +137,188 @@ class DocumentManager: class MarkdownProcessor: """Processes markdown files and stores content in the database.""" - def __init__(self, db_manager: 'DatabaseManager') -> None: - """Initialize the MarkdownProcessor.""" self.db_manager = db_manager def process_markdown(self, markdown_file: str, document_id: int) -> None: - """Process a markdown file and store its content in the database.""" markdown_text = self.read_markdown_file(markdown_file) md = MarkdownIt() tokens = md.parse(markdown_text) - - self.clear_document_content(document_id) - self.store_markdown_content(tokens, document_id) + + self.update_document_content(tokens, document_id) def read_markdown_file(self, file_path: str) -> str: - """Read content from a markdown file.""" with open(file_path, 'r', encoding='utf-8') as file: return file.read() - def clear_document_content(self, document_id: int) -> None: - """Clear existing content for a document in the database.""" - logging.debug(f"!! DELETING FROM DATABASE, document_id: {document_id}") - self.db_manager.cursor.execute('DELETE FROM headings WHERE document_id = ?', (document_id,)) - self.db_manager.cursor.execute('DELETE FROM body WHERE document_id = ?', (document_id,)) - - def store_markdown_content(self, tokens: List, document_id: int) -> None: - """Store parsed markdown content in the database.""" - parent_stack: List[Tuple[int, int]] = [] # (level, heading_id) - current_heading_id = None - for token in tokens: - content_preview = ' '.join(token.content.split()[:10]) + '...' \ - if len(token.content.split()) > 10 else token.content - - #logging.debug(f"Processing token: {token.type}, content: {content_preview}") - if token.type == 'heading_open': - level = int(token.tag.strip('h')) - content_token = tokens[tokens.index(token) + 1] - title = content_token.content - - # Find the appropriate parent - while parent_stack and parent_stack[-1][0] >= level: - parent_stack.pop() - - parent_id = parent_stack[-1][1] if parent_stack else None - current_heading_id = self.insert_heading(level, title, parent_id, document_id) - - parent_stack.append((level, current_heading_id)) - elif token.type == 'inline' and current_heading_id and token.content.strip(): - # Only insert non-empty content that's not part of a heading - if tokens[tokens.index(token) - 1].type != 'heading_open': - self.insert_body(token.content, current_heading_id, document_id) + def update_document_content(self, tokens: List, document_id: int) -> None: + existing_structure = self.get_existing_document_structure(document_id) + new_structure = self.parse_new_structure(tokens) - self.db_manager.conn.commit() + self.merge_structures(existing_structure, new_structure, document_id) + + def get_existing_document_structure(self, document_id: int) -> Dict: + structure = {} + self.db_manager.cursor.execute(''' + SELECT h.id, h.level, h.title, h.parent_id, b.content + FROM headings h + LEFT JOIN body b ON h.id = b.heading_id + WHERE h.document_id = ? AND h.isDeleted = 0 + ORDER BY h.level, h.id + ''', (document_id,)) + for heading_id, level, title, parent_id, content in self.db_manager.cursor.fetchall(): + structure[heading_id] = { + 'level': level, + 'title': title, + 'parent_id': parent_id, + 'content': content, + 'children': [] + } + # Build the tree structure + root = {} + for id, node in structure.items(): + if node['parent_id'] in structure: + structure[node['parent_id']]['children'].append(id) + else: + root[id] = node + return root + + def parse_new_structure(self, tokens: List) -> Dict: + structure = {} + current_heading = None + current_content = [] + parent_stack = [{"id": None, "level": 0}] + + for token in tokens: + if token.type == 'heading_open': + if current_heading: + structure[current_heading]['content'] = ''.join(current_content).strip() + level = int(token.tag.strip('h')) + while parent_stack[-1]['level'] >= level: + parent_stack.pop() + current_heading = str(uuid.uuid4()) # Generate a temporary ID + structure[current_heading] = { + 'level': level, + 'title': '', + 'parent_id': parent_stack[-1]['id'], + 'content': '', + 'children': [] + } + parent_stack.append({"id": current_heading, "level": level}) + current_content = [] + elif token.type == 'heading_close': + structure[current_heading]['content'] = ''.join(current_content).strip() + elif token.type == 'inline' and current_heading: + if structure[current_heading]['title'] == '': + structure[current_heading]['title'] = token.content + else: + current_content.append(token.content) + elif current_heading: + current_content.append(token.content) + + if current_heading: + structure[current_heading]['content'] = ''.join(current_content).strip() + + return structure + + def merge_structures(self, existing: Dict, new: Dict, document_id: int) -> None: + def merge_recursive(existing_node, new_node, parent_id): + if not existing_node: + # This is a new node, insert it + heading_id = self.insert_heading(new_node['level'], new_node['title'], parent_id, document_id) + self.insert_body(new_node['content'], heading_id, document_id) + for child in new_node['children']: + merge_recursive(None, new[child], heading_id) + else: + # Update existing node + self.update_heading(existing_node['id'], new_node['title'], new_node['level'], parent_id) + self.update_body(existing_node['id'], new_node['content'], document_id) + + # Process children + existing_children = {child['title']: child for child in existing_node['children']} + new_children = {child['title']: child for child in new_node['children']} + + for title, child in new_children.items(): + if title in existing_children: + merge_recursive(existing_children[title], child, existing_node['id']) + else: + merge_recursive(None, child, existing_node['id']) + + for title, child in existing_children.items(): + if title not in new_children: + self.soft_delete_heading(child['id']) + + for new_root in new.values(): + existing_root = next((node for node in existing.values() if node['title'] == new_root['title']), None) + merge_recursive(existing_root, new_root, None) def insert_heading(self, level: int, title: str, parent_id: Optional[int], document_id: int) -> int: - """Insert a heading into the database.""" - logging.debug(f"Inserting title: {title} level: {level}") self.db_manager.cursor.execute(''' INSERT INTO headings (level, title, parent_id, document_id) VALUES (?, ?, ?, ?) ''', (level, title, parent_id, document_id)) return self.db_manager.cursor.lastrowid + def update_heading(self, heading_id: int, title: str, level: int, parent_id: Optional[int]) -> None: + self.db_manager.cursor.execute(''' + UPDATE headings + SET title = ?, level = ?, parent_id = ?, updated_timestamp = CURRENT_TIMESTAMP + WHERE id = ? + ''', (title, level, parent_id, heading_id)) + def insert_body(self, content: str, heading_id: int, document_id: int) -> None: - """Insert body content into the database with checksumming.""" md5sum = hashlib.md5(content.encode()).hexdigest() self.db_manager.cursor.execute(''' INSERT INTO body (content, heading_id, document_id, md5sum) VALUES (?, ?, ?, ?) ''', (content, heading_id, document_id, md5sum)) + def update_body(self, heading_id: int, content: str, document_id: int) -> None: + md5sum = hashlib.md5(content.encode()).hexdigest() + self.db_manager.cursor.execute(''' + UPDATE body + SET content = ?, md5sum = ?, updated_timestamp = CURRENT_TIMESTAMP + WHERE heading_id = ? AND document_id = ? + ''', (content, md5sum, heading_id, document_id)) + + def soft_delete_heading(self, heading_id: int) -> None: + now = datetime.now().isoformat() + self.db_manager.cursor.execute(''' + UPDATE headings + SET isDeleted = 1, deleted_timestamp = ? + WHERE id = ? + ''', (now, heading_id)) + # Also soft delete associated body content + self.db_manager.cursor.execute(''' + UPDATE body + SET isDeleted = 1, deleted_timestamp = ? + WHERE heading_id = ? + ''', (now, heading_id)) + class TopicReader: """Reads and retrieves topics from the database.""" - def __init__(self, db_manager: 'DatabaseManager'): - """ - Initialize the TopicReader. - - Args: - db_manager (DatabaseManager): An instance of DatabaseManager. - """ self.db_manager = db_manager - def fetch_headings(self) -> List[Tuple[int, str, int]]: - """ - Fetch all non-deleted headings from the database. - """ - self.db_manager.cursor.execute('SELECT id, title, level FROM headings WHERE isDeleted = 0 ORDER BY level, id') + def fetch_headings(self) -> List[Tuple[int, str, int, Optional[int]]]: + self.db_manager.cursor.execute(''' + SELECT id, title, level, parent_id + FROM headings + WHERE isDeleted = 0 + ORDER BY level, id + ''') return self.db_manager.cursor.fetchall() def fetch_topic_chain(self, heading_id: int) -> List[Tuple[int, str, int]]: - """ - Fetch the topic chain (hierarchy of parent topics) for a given heading. - - Returns: - List[Tuple[int, str, int]]: List of (id, title, level) tuples representing the topic chain. - """ chain = [] current_id = heading_id while current_id is not None: - self.db_manager.cursor.execute('SELECT id, title, level, parent_id FROM headings WHERE id = ?', (current_id,)) + self.db_manager.cursor.execute(''' + SELECT id, title, level, parent_id + FROM headings + WHERE id = ? + ''', (current_id,)) result = self.db_manager.cursor.fetchone() if result: chain.append((result[0], result[1], result[2])) @@ -247,119 +327,74 @@ class TopicReader: break return list(reversed(chain)) - - def list_headings(self) -> str: - """ - List all available headings in a hierarchical structure. - Returns: - str: A formatted string containing all headings. - """ + def list_headings(self) -> str: headings = self.fetch_headings() result = "Available headings:\n" - for _, title, level in headings: - indent = " " * (level - 1) - result += f"{indent}- {title}\n" + def build_tree(parent_id, level): + tree = "" + for id, title, hlevel, parent in headings: + if parent == parent_id: + indent = " " * (hlevel - 1) + tree += f"{indent}- {title}\n" + tree += build_tree(id, hlevel + 1) + return tree + result += build_tree(None, 1) return result.strip() - def fetch_body_and_subtopics(self, heading_id: int, include_subtopics: bool = True, level_offset: int = 0) -> str: - """ - Fetch body content and subtopics for a given heading with improved Markdown formatting. - - Args: - heading_id (int): ID of the heading to fetch. - include_subtopics (bool): Whether to include subtopics in the result. - level_offset (int): Offset to adjust heading levels for proper nesting. - - Returns: - str: Formatted string containing the heading content and subtopics. - """ - # Fetch the current heading and body content - self.db_manager.cursor.execute('SELECT level, title FROM headings WHERE id = ?', (heading_id,)) - level, title = self.db_manager.cursor.fetchone() - - # Adjust the level based on the offset - adjusted_level = max(1, level - level_offset) - - # Fetch the content for this heading - self.db_manager.cursor.execute('SELECT content FROM body WHERE heading_id = ?', (heading_id,)) - rows = self.db_manager.cursor.fetchall() - body_content = '\n'.join([row[0] for row in rows]) - - # Construct the result with proper spacing - result = f"\n{'#' * adjusted_level} {title}\n\n" - if body_content.strip(): - result += f"{body_content.strip()}\n\n" - - if include_subtopics: - # Fetch all subtopics that are children of the current heading - subtopics = self._fetch_subtopics(heading_id, adjusted_level) - for subtopic_id, _, _ in subtopics: - # Recursively fetch subtopic content - subtopic_content = self.fetch_body_and_subtopics(subtopic_id, include_subtopics=True, level_offset=level_offset) - result += subtopic_content - - return result.strip() + "\n" # Ensure there's a newline at the end of each section - def get_topic_content(self, input_title: str) -> Optional[str]: - """ - Get the content of a topic based on the input title, including its topic chain and subtopics. - - Returns: - str or None: Formatted string containing the topic chain, content, and subtopics, or None if not found. - """ heading_id = self.find_closest_heading(input_title) if heading_id: topic_chain = self.fetch_topic_chain(heading_id) - result = "" - for i, (id, title, level) in enumerate(topic_chain): - if id == heading_id: - # Fetch the full content for the selected topic and its subtopics - result += self.fetch_body_and_subtopics(id, include_subtopics=True, level_offset=i) - else: - # Include only the heading chain without duplicating content - result += f"\n{'#' * (level - i)} {title}\n\n" - return result.strip() + "\n" # Ensure there's a final newline - print(f"No topic found matching '{input_title}'.") + result = self.build_full_content(topic_chain[-1][0]) + return result return None - def _fetch_subtopics(self, heading_id: int, parent_level: int) -> List[Tuple[int, int, str]]: - """ - Fetch all subtopics that are children of the given heading. - - Returns: - List of tuples containing the subtopic's ID, level, and title. - """ + def build_full_content(self, heading_id: int, level_offset: int = 0) -> str: self.db_manager.cursor.execute(''' - SELECT id, level, title - FROM headings + SELECT h.level, h.title, b.content + FROM headings h + LEFT JOIN body b ON h.id = b.heading_id + WHERE h.id = ? AND h.isDeleted = 0 + ''', (heading_id,)) + heading = self.db_manager.cursor.fetchone() + if not heading: + return "" + + level, title, content = heading + adjusted_level = max(1, level - level_offset) + result = f"{'#' * adjusted_level} {title}\n\n" + if content: + result += f"{content.strip()}\n\n" + + # Fetch and process all child headings + self.db_manager.cursor.execute(''' + SELECT id FROM headings WHERE parent_id = ? AND isDeleted = 0 ORDER BY level, id ''', (heading_id,)) - return self.db_manager.cursor.fetchall() + children = self.db_manager.cursor.fetchall() + for child in children: + result += self.build_full_content(child[0], level_offset) + + return result def find_closest_heading(self, input_title: str) -> Optional[int]: - """ - Find the closest matching heading to the input title using fuzzy matching. - - Returns: - int or None: ID of the closest matching heading, or None if no match found. - """ headings = self.fetch_headings() if not headings: print("No topics found in the database.") return None - heading_titles = [title for _, title, _ in headings] + heading_titles = [title for _, title, _, _ in headings] closest_match, confidence = process.extractOne(input_title, heading_titles, scorer=fuzz.token_sort_ratio) if confidence < 50: print(f"No close matches found for '{input_title}' (Confidence: {confidence})") return None - for heading_id, title, level in headings: + for heading_id, title, _, _ in headings: if title == closest_match: return heading_id