From b94502be9df3e4ddfb0548db98cdca8f494ac705 Mon Sep 17 00:00:00 2001 From: kalzu rekku Date: Sat, 5 Oct 2024 20:59:32 +0300 Subject: [PATCH] Trying to make sense of the MarkdownProcessor.. --- markdown_sqlite.py | 511 ++++++++++++++++++++++++++------------------- 1 file changed, 291 insertions(+), 220 deletions(-) diff --git a/markdown_sqlite.py b/markdown_sqlite.py index cefbf6d..7f9c95e 100644 --- a/markdown_sqlite.py +++ b/markdown_sqlite.py @@ -43,29 +43,33 @@ class DatabaseManager: CREATE TABLE IF NOT EXISTS headings ( id INTEGER PRIMARY KEY AUTOINCREMENT, + uuid TEXT NOT NULL UNIQUE, level INTEGER NOT NULL, title TEXT NOT NULL, - parent_id INTEGER, + parent_uuid TEXT, document_id INTEGER NOT NULL, + path TEXT NOT NULL, + headings_order INTEGER, added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, updated_timestamp DATETIME, deleted_timestamp DATETIME, isDeleted BOOLEAN DEFAULT 0, - FOREIGN KEY (parent_id) REFERENCES headings(id), + FOREIGN KEY (parent_uuid) REFERENCES headings(uuid), FOREIGN KEY (document_id) REFERENCES documents(id) ); CREATE TABLE IF NOT EXISTS body ( id INTEGER PRIMARY KEY AUTOINCREMENT, + uuid TEXT NOT NULL UNIQUE, content TEXT, - heading_id INTEGER NOT NULL, + heading_uuid TEXT NOT NULL, document_id INTEGER NOT NULL, md5sum TEXT, added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, updated_timestamp DATETIME, deleted_timestamp DATETIME, isDeleted BOOLEAN DEFAULT 0, - FOREIGN KEY (heading_id) REFERENCES headings(id), + FOREIGN KEY (heading_uuid) REFERENCES headings(uuid), FOREIGN KEY (document_id) REFERENCES documents(id) ); ''') @@ -83,12 +87,6 @@ class DocumentManager: def document_exists(self, document_name: str) -> Optional[Tuple[int]]: """ Check if a document exists in the database. - - Args: - document_name: Name of the document to check. - - Returns: - Document ID if it exists, None otherwise. """ self.db_manager.cursor.execute('SELECT id FROM documents WHERE name = ?', (document_name,)) return self.db_manager.cursor.fetchone() @@ -119,7 +117,7 @@ class DocumentManager: def soft_delete_document(self, document_id: int) -> None: """Soft delete a document by marking it as deleted in the database.""" - logging.debug(f"** This now soft deleted, document_id: {document_id}") + logging.debug(f"** This document is now soft deleted, document_id: {document_id}") now: str = datetime.now().isoformat() self.db_manager.cursor.execute(''' UPDATE documents SET isDeleted = 1, deleted_timestamp = ? WHERE id = ? @@ -146,6 +144,7 @@ class MarkdownProcessor: md = MarkdownIt() tokens = md.parse(markdown_text) + print('### Calling update_document_content') self.update_document_content(tokens, document_id) def read_markdown_file(self, file_path: str) -> str: @@ -153,41 +152,45 @@ class MarkdownProcessor: return file.read() def update_document_content(self, tokens: List, document_id: int) -> None: + existing_structure = {} existing_structure = self.get_existing_document_structure(document_id) - new_structure = self.parse_new_structure(tokens) - + new_structure = self.parse_new_structure(tokens, document_id, existing_structure) + print('### Calling merge_structures...') self.merge_structures(existing_structure, new_structure, document_id) + def get_existing_document_structure(self, document_id: int) -> Dict: structure = {} self.db_manager.cursor.execute(''' - SELECT h.id, h.level, h.title, h.parent_id, b.content + SELECT h.uuid, h.level, h.title, h.parent_uuid, h.path, b.content, b.uuid FROM headings h - LEFT JOIN body b ON h.id = b.heading_id + LEFT JOIN body b ON h.uuid = b.heading_uuid WHERE h.document_id = ? AND h.isDeleted = 0 ORDER BY h.level, h.id ''', (document_id,)) - for heading_id, level, title, parent_id, content in self.db_manager.cursor.fetchall(): - structure[heading_id] = { - 'id': heading_id, # Add this line to include the id in each node + for heading_uuid, level, title, parent_uuid, path, content, body_uuid in self.db_manager.cursor.fetchall(): + structure[heading_uuid] = { + 'uuid': heading_uuid, 'level': level, 'title': title, - 'parent_id': parent_id, + 'parent_uuid': parent_uuid, + 'path': path, 'content': content, + 'body_uuid': body_uuid, 'children': [] } # Build the tree structure - for id, node in structure.items(): - if node['parent_id'] in structure: - structure[node['parent_id']]['children'].append(id) + for uuid, node in structure.items(): + if node['parent_uuid'] in structure: + structure[node['parent_uuid']]['children'].append(uuid) return structure - def parse_new_structure(self, tokens: List) -> Dict: + def parse_new_structure(self, tokens: List, document_id: int, existing_structure: Dict) -> Dict: structure = {} current_heading = None current_content = [] - parent_stack = [{"id": None, "level": 0}] - + parent_stack = [{"uuid": None, "level": 0, "path": ""}] + for token in tokens: if token.type == 'heading_open': if current_heading: @@ -195,43 +198,91 @@ class MarkdownProcessor: level = int(token.tag.strip('h')) while parent_stack[-1]['level'] >= level: parent_stack.pop() - current_heading = str(uuid.uuid4()) # Generate a temporary ID + + parent_path = parent_stack[-1]['path'] + current_heading = str(uuid.uuid4()) # Always assign a new UUID here, may change later + structure[current_heading] = { + 'uuid': current_heading, 'level': level, 'title': '', - 'parent_id': parent_stack[-1]['id'], + 'parent_uuid': parent_stack[-1]['uuid'], + 'path': f"{parent_path}/{current_heading}" if parent_path else current_heading, 'content': '', 'children': [] } - parent_stack.append({"id": current_heading, "level": level}) + parent_stack.append({"uuid": current_heading, "level": level, "path": structure[current_heading]['path']}) current_content = [] + elif token.type == 'heading_close': structure[current_heading]['content'] = ''.join(current_content).strip() + elif token.type == 'inline' and current_heading: if structure[current_heading]['title'] == '': + # Populate the title structure[current_heading]['title'] = token.content + + # Now check for existing UUID based on title, level, and parent + existing_uuid = next( + (uuid for uuid, node in existing_structure.items() + if node['title'] == structure[current_heading]['title'] + and node['level'] == structure[current_heading]['level'] + and node['parent_uuid'] == structure[current_heading]['parent_uuid']), None) + + if existing_uuid: + # If found in existing structure, replace the new UUID + structure[existing_uuid] = structure.pop(current_heading) + current_heading = existing_uuid else: current_content.append(token.content) elif current_heading: current_content.append(token.content) - + if current_heading: structure[current_heading]['content'] = ''.join(current_content).strip() - + return structure def merge_structures(self, existing: Dict, new: Dict, document_id: int) -> None: - def merge_recursive(existing_node, new_node, parent_id): + logging.info(f"Starting merge_structures for document_id: {document_id}") + + def merge_recursive(existing_node, new_node, parent_uuid): + logging.debug(f"Processing node: {new_node['title']}") + if not existing_node: - # This is a new node, insert it - heading_id = self.insert_heading(new_node['level'], new_node['title'], parent_id, document_id) - self.insert_body(new_node['content'], heading_id, document_id) + logging.debug(f"No existing node found for {new_node['title']}") + # Check if a heading with the same title already exists at this level + self.db_manager.cursor.execute(''' + SELECT uuid FROM headings + WHERE title = ? AND level = ? AND parent_uuid = ? AND document_id = ? AND isDeleted = 0 + ''', (new_node['title'], new_node['level'], parent_uuid, document_id)) + existing_uuid = self.db_manager.cursor.fetchone() + + if existing_uuid: + heading_uuid = existing_uuid[0] + logging.info(f"Updating existing heading: {new_node['title']} (UUID: {heading_uuid})") + self.update_heading(heading_uuid, new_node['title'], new_node['level'], parent_uuid, new_node['path']) + else: + logging.info(f"Inserting new heading: {new_node['title']}") + heading_uuid = self.insert_heading(new_node['level'], new_node['title'], parent_uuid, document_id, new_node['path']) + + if new_node['content']: + logging.debug(f"Inserting body content for heading: {new_node['title']}") + body_uuid = self.insert_body(new_node['content'], heading_uuid, document_id) + for child in new_node['children']: - merge_recursive(None, new[child], heading_id) + merge_recursive(None, new[child], heading_uuid) else: + logging.debug(f"Updating existing node: {existing_node['title']}") # Update existing node - self.update_heading(existing_node['id'], new_node['title'], new_node['level'], parent_id) - self.update_body(existing_node['id'], new_node['content'], document_id) + self.update_heading(existing_node['uuid'], new_node['title'], new_node['level'], parent_uuid, new_node['path']) + if new_node['content']: + if existing_node['body_uuid']: + logging.debug(f"Updating body content for heading: {existing_node['title']}") + self.update_body(existing_node['body_uuid'], new_node['content'], document_id) + else: + logging.debug(f"Inserting new body content for existing heading: {existing_node['title']}") + self.insert_body(new_node['content'], existing_node['uuid'], document_id) # Process children existing_children = {child['title']: child for child in existing_node['children']} @@ -239,89 +290,125 @@ class MarkdownProcessor: for title, child in new_children.items(): if title in existing_children: - merge_recursive(existing_children[title], child, existing_node['id']) + merge_recursive(existing_children[title], child, existing_node['uuid']) else: - merge_recursive(None, child, existing_node['id']) + merge_recursive(None, child, existing_node['uuid']) for title, child in existing_children.items(): if title not in new_children: - self.soft_delete_heading(child['id']) - + logging.info(f"Soft deleting heading: {child['title']}") + self.soft_delete_heading(child['uuid']) + for new_root in new.values(): - existing_root = next((node for node in existing.values() if node['title'] == new_root['title']), None) + logging.info(f"Processing root node: {new_root['title']}") + existing_root = next((node for node in existing.values() if node['path'] == new_root['path']), None) merge_recursive(existing_root, new_root, None) + + logging.info("Merge structures completed") - def insert_heading(self, level: int, title: str, parent_id: Optional[int], document_id: int) -> int: + def insert_heading(self, level: int, title: str, parent_uuid: Optional[str], document_id: int, path: str) -> str: + heading_uuid = str(uuid.uuid4()) self.db_manager.cursor.execute(''' - INSERT INTO headings (level, title, parent_id, document_id) - VALUES (?, ?, ?, ?) - ''', (level, title, parent_id, document_id)) - return self.db_manager.cursor.lastrowid + INSERT INTO headings (uuid, level, title, parent_uuid, document_id, path) + VALUES (?, ?, ?, ?, ?, ?) + ''', (heading_uuid, level, title, parent_uuid, document_id, path)) + return heading_uuid - def update_heading(self, heading_id: int, title: str, level: int, parent_id: Optional[int]) -> None: + def update_heading(self, heading_uuid: str, title: str, level: int, parent_uuid: Optional[str], path: str) -> None: self.db_manager.cursor.execute(''' UPDATE headings - SET title = ?, level = ?, parent_id = ?, updated_timestamp = CURRENT_TIMESTAMP - WHERE id = ? - ''', (title, level, parent_id, heading_id)) + SET title = ?, level = ?, parent_uuid = ?, path = ?, updated_timestamp = CURRENT_TIMESTAMP + WHERE uuid = ? + ''', (title, level, parent_uuid, path, heading_uuid)) - def insert_body(self, content: str, heading_id: int, document_id: int) -> None: + def insert_body(self, content: str, heading_uuid: str, document_id: int) -> str: + body_uuid = str(uuid.uuid4()) md5sum = hashlib.md5(content.encode()).hexdigest() - self.db_manager.cursor.execute(''' - INSERT INTO body (content, heading_id, document_id, md5sum) - VALUES (?, ?, ?, ?) - ''', (content, heading_id, document_id, md5sum)) + print(f"###### Trying to insert body text with md5sum of: {md5sum} to uuid: {body_uuid}, with content: \n{content}\n") + + # Verify input parameters + if not all([content, heading_uuid, document_id]): + raise ValueError("Missing required parameters for insert_body") + + try: + # Check if heading_uuid exists + self.db_manager.cursor.execute("SELECT 1 FROM headings WHERE uuid = ?", (heading_uuid,)) + if not self.db_manager.cursor.fetchone(): + raise ValueError(f"heading_uuid {heading_uuid} does not exist in headings table") + + # Check if document_id exists + self.db_manager.cursor.execute("SELECT 1 FROM documents WHERE id = ?", (document_id,)) + if not self.db_manager.cursor.fetchone(): + raise ValueError(f"document_id {document_id} does not exist in documents table") + + # Insert the body + self.db_manager.cursor.execute(''' + INSERT INTO body (uuid, content, heading_uuid, document_id, md5sum) + VALUES (?, ?, ?, ?, ?) + ''', (body_uuid, content, heading_uuid, document_id, md5sum)) + + self.db_manager.conn.commit() + print(f"###### Successfully inserted body with uuid: {body_uuid}") + except sqlite3.Error as e: + print(f"An error occurred while inserting body: {e}") + self.db_manager.conn.rollback() + raise + except ValueError as e: + print(f"Validation error: {e}") + raise + + return body_uuid - def update_body(self, heading_id: int, content: str, document_id: int) -> None: + def update_body(self, body_uuid: str, content: str, document_id: int) -> None: md5sum = hashlib.md5(content.encode()).hexdigest() self.db_manager.cursor.execute(''' UPDATE body SET content = ?, md5sum = ?, updated_timestamp = CURRENT_TIMESTAMP - WHERE heading_id = ? AND document_id = ? - ''', (content, md5sum, heading_id, document_id)) + WHERE uuid = ? AND document_id = ? + ''', (content, md5sum, body_uuid, document_id)) - def soft_delete_heading(self, heading_id: int) -> None: + def soft_delete_heading(self, heading_uuid: str) -> None: now = datetime.now().isoformat() self.db_manager.cursor.execute(''' UPDATE headings SET isDeleted = 1, deleted_timestamp = ? - WHERE id = ? - ''', (now, heading_id)) + WHERE uuid = ? + ''', (now, heading_uuid)) # Also soft delete associated body content self.db_manager.cursor.execute(''' UPDATE body SET isDeleted = 1, deleted_timestamp = ? - WHERE heading_id = ? - ''', (now, heading_id)) + WHERE heading_uuid = ? + ''', (now, heading_uuid)) class TopicReader: """Reads and retrieves topics from the database.""" def __init__(self, db_manager: 'DatabaseManager'): self.db_manager = db_manager - def fetch_headings(self) -> List[Tuple[int, str, int, Optional[int]]]: + def fetch_headings(self) -> List[Tuple[str, str, int, Optional[str]]]: self.db_manager.cursor.execute(''' - SELECT id, title, level, parent_id + SELECT uuid, title, level, parent_uuid FROM headings WHERE isDeleted = 0 - ORDER BY level, id + ORDER BY level, headings_order ''') return self.db_manager.cursor.fetchall() - def fetch_topic_chain(self, heading_id: int) -> List[Tuple[int, str, int]]: + def fetch_topic_chain(self, heading_uuid: str) -> List[Tuple[str, str, int]]: chain = [] - current_id = heading_id + current_uuid = heading_uuid - while current_id is not None: + while current_uuid is not None: self.db_manager.cursor.execute(''' - SELECT id, title, level, parent_id + SELECT uuid, title, level, parent_uuid FROM headings - WHERE id = ? - ''', (current_id,)) + WHERE uuid = ? + ''', (current_uuid,)) result = self.db_manager.cursor.fetchone() if result: chain.append((result[0], result[1], result[2])) - current_id = result[3] + current_uuid = result[3] else: break @@ -331,33 +418,33 @@ class TopicReader: headings = self.fetch_headings() result = "Available headings:\n" - def build_tree(parent_id, level): + def build_tree(parent_uuid, level): tree = "" - for id, title, hlevel, parent in headings: - if parent == parent_id: + for uuid, title, hlevel, parent in headings: + if parent == parent_uuid: indent = " " * (hlevel - 1) tree += f"{indent}- {title}\n" - tree += build_tree(id, hlevel + 1) + tree += build_tree(uuid, hlevel + 1) return tree result += build_tree(None, 1) return result.strip() def get_topic_content(self, input_title: str) -> Optional[str]: - heading_id = self.find_closest_heading(input_title) - if heading_id: - topic_chain = self.fetch_topic_chain(heading_id) + heading_uuid = self.find_closest_heading(input_title) + if heading_uuid: + topic_chain = self.fetch_topic_chain(heading_uuid) result = self.build_full_content(topic_chain[-1][0]) return result return None - def build_full_content(self, heading_id: int, level_offset: int = 0) -> str: + def build_full_content(self, heading_uuid: str, level_offset: int = 0) -> str: self.db_manager.cursor.execute(''' SELECT h.level, h.title, b.content FROM headings h - LEFT JOIN body b ON h.id = b.heading_id - WHERE h.id = ? AND h.isDeleted = 0 - ''', (heading_id,)) + LEFT JOIN body b ON h.uuid = b.heading_uuid + WHERE h.uuid = ? AND h.isDeleted = 0 + ''', (heading_uuid,)) heading = self.db_manager.cursor.fetchone() if not heading: return "" @@ -370,17 +457,17 @@ class TopicReader: # Fetch and process all child headings self.db_manager.cursor.execute(''' - SELECT id FROM headings - WHERE parent_id = ? AND isDeleted = 0 - ORDER BY level, id - ''', (heading_id,)) + SELECT uuid FROM headings + WHERE parent_uuid = ? AND isDeleted = 0 + ORDER BY level, headings_order + ''', (heading_uuid,)) children = self.db_manager.cursor.fetchall() for child in children: result += self.build_full_content(child[0], level_offset) return result - def find_closest_heading(self, input_title: str) -> Optional[int]: + def find_closest_heading(self, input_title: str) -> Optional[str]: headings = self.fetch_headings() if not headings: print("No topics found in the database.") @@ -393,9 +480,9 @@ class TopicReader: print(f"No close matches found for '{input_title}' (Confidence: {confidence})") return None - for heading_id, title, _, _ in headings: + for heading_uuid, title, _, _ in headings: if title == closest_match: - return heading_id + return heading_uuid return None @@ -408,41 +495,57 @@ def compute_file_hash(file_path: str) -> str: for chunk in iter(lambda: f.read(4096), b""): hash_md5.update(chunk) return hash_md5.hexdigest() - -def generate_calendar(year: int) -> str: - """Generate a markdown calendar for the specified year.""" + +def generate_calendar(year: int, db_manager: 'DatabaseManager', document_id: int) -> str: + """ + Generate a markdown calendar for the specified year. + """ calendar_markdown = f"# {year}\n\n" current_date = datetime.now().date() + # Loop through the months for month in range(1, 13): month_name = datetime(year, month, 1).strftime('%B') calendar_markdown += f"## {month:02d} / {month_name}\n\n" - # Calculate the number of days in the month - num_days = (datetime(year, month + 1, 1) - datetime(year, month, 1)).days if month < 12 else (datetime(year + 1, 1, 1) - datetime(year, month, 1)).days - - # Generate calendar entries for each day + # Determine the number of days in the month + if month == 12: + num_days = (datetime(year + 1, 1, 1) - datetime(year, month, 1)).days + else: + num_days = (datetime(year, month + 1, 1) - datetime(year, month, 1)).days + + # Create calendar entries for each day in order for day in range(1, num_days + 1): day_date = datetime(year, month, day).date() day_name = day_date.strftime('%a') - - # Check if this is the current day and make it bold if so + + # Add bold styling for the current date if str(day_date) == str(current_date): calendar_markdown += f"**{day:02d} / {day_name}**\n" else: calendar_markdown += f"{day:02d} / {day_name}\n" - calendar_markdown += "\n" # Add a newline after each month + calendar_markdown += '\n' + # Now parse the markdown and insert into the database + #parse_and_insert_markdown(calendar_markdown, db_manager, document_id, year) + return calendar_markdown -def convert_to_html(markdown_content: str) -> str: +def convert_to_html(markdown_content: str, heading_uuid: Optional[str] = None) -> str: """ - Convert Markdown content to HTML. + Convert Markdown content (or specific section) to HTML. """ md = MarkdownIt() - html_content = md.render(markdown_content) + if heading_uuid: + # Fetch content for a specific heading and its sub-headings from the database + # Example SQL to get heading content based on UUID: + # SELECT title, content FROM headings WHERE uuid = ? + pass + + html_content = md.render(markdown_content) + # Wrap the content in a basic HTML structure html_document = f""" @@ -450,7 +553,7 @@ def convert_to_html(markdown_content: str) -> str: - Calendar + Document