Compare commits

...

2 Commits

2 changed files with 605 additions and 220 deletions

314
MarkdownProcessor.py Normal file
View File

@ -0,0 +1,314 @@
import sqlite3
import uuid
import hashlib
import logging
from datetime import datetime
from typing import List, Dict, Optional
from markdown_it import MarkdownIt
class DatabaseManager:
    """Own the SQLite connection and make sure the schema exists.

    The manager opens one connection and one cursor, both exposed as public
    attributes (``conn`` and ``cursor``) for the other classes to use.
    It can also be used as a context manager: leaving the ``with`` block
    commits on success, rolls back on error, and always closes the
    connection.
    """

    def __init__(self, db_file: str):
        """Open ``db_file`` and create any missing tables.

        Args:
            db_file: Path passed to ``sqlite3.connect`` (``":memory:"`` works).

        Raises:
            sqlite3.Error: If the database cannot be opened or the schema
                cannot be created. The connection is closed before the
                error propagates so a failed construction does not leak it.
        """
        self.conn: sqlite3.Connection = sqlite3.connect(db_file, timeout=10)
        try:
            self.cursor: sqlite3.Cursor = self.conn.cursor()
            self.create_tables()
        except Exception:
            # Nobody holds a reference to a half-built manager, so release
            # the connection here instead of leaving it to the GC.
            self.conn.close()
            raise

    def __enter__(self) -> 'DatabaseManager':
        """Return the manager itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Commit or roll back pending work, then close the connection.

        Exceptions raised inside the ``with`` block are never suppressed.
        """
        try:
            if exc_type is None:
                self.conn.commit()
            else:
                self.conn.rollback()
        finally:
            self.close()

    def create_tables(self) -> None:
        """Create the documents, headings and body tables if they are missing.

        Every statement uses ``IF NOT EXISTS``, so calling this repeatedly
        is harmless.
        """
        self.cursor.executescript('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL UNIQUE,
                file_path TEXT NOT NULL,
                md5sum TEXT,
                added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                updated_timestamp DATETIME,
                deleted_timestamp DATETIME,
                isDeleted BOOLEAN DEFAULT 0
            );
            CREATE TABLE IF NOT EXISTS headings (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                uuid TEXT NOT NULL UNIQUE,
                level INTEGER NOT NULL,
                title TEXT NOT NULL,
                parent_uuid TEXT,
                document_id INTEGER NOT NULL,
                path TEXT NOT NULL,
                headings_order INTEGER,
                added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                updated_timestamp DATETIME,
                deleted_timestamp DATETIME,
                isDeleted BOOLEAN DEFAULT 0,
                FOREIGN KEY (parent_uuid) REFERENCES headings(uuid),
                FOREIGN KEY (document_id) REFERENCES documents(id)
            );
            CREATE TABLE IF NOT EXISTS body (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                uuid TEXT NOT NULL UNIQUE,
                content TEXT,
                heading_uuid TEXT NOT NULL,
                document_id INTEGER NOT NULL,
                md5sum TEXT,
                added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                updated_timestamp DATETIME,
                deleted_timestamp DATETIME,
                isDeleted BOOLEAN DEFAULT 0,
                FOREIGN KEY (heading_uuid) REFERENCES headings(uuid),
                FOREIGN KEY (document_id) REFERENCES documents(id)
            );
        ''')

    def close(self) -> None:
        """Close the database connection.

        This does not commit; commit beforehand (or use the manager in a
        ``with`` block) if pending changes must be kept.
        """
        self.conn.close()
class MarkdownProcessor:
    """Parse markdown into a heading tree and sync that tree with the database.

    Each heading is identified by a UUID. When a document is re-processed,
    a heading keeps its UUID if a stored heading with the same title, level
    and parent exists; otherwise it is inserted as new. Stored headings that
    no longer appear in the document are soft deleted.
    """

    def __init__(self, db_manager: 'DatabaseManager') -> None:
        """Remember the manager whose ``conn`` and ``cursor`` are used for all SQL."""
        self.db_manager = db_manager

    def process_markdown(self, markdown_file: str, document_id: int) -> None:
        """Read ``markdown_file``, tokenize it and sync the result for ``document_id``."""
        markdown_text = self.read_markdown_file(markdown_file)
        tokens = MarkdownIt().parse(markdown_text)
        logging.debug("Calling update_document_content")
        self.update_document_content(tokens, document_id)

    def read_markdown_file(self, file_path: str) -> str:
        """Return the full UTF-8 text of ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def update_document_content(self, tokens: List, document_id: int) -> None:
        """Diff the parsed ``tokens`` against the stored headings and persist the changes."""
        existing_structure = self.get_existing_document_structure(document_id)
        new_structure = self.parse_new_structure(tokens, document_id, existing_structure)
        logging.debug("Calling merge_structures")
        self.merge_structures(existing_structure, new_structure, document_id)

    def get_existing_document_structure(self, document_id: int) -> Dict:
        """Load the live headings (and their live body) of a document.

        Returns:
            A dict keyed by heading UUID. Each value holds ``uuid``,
            ``level``, ``title``, ``parent_uuid``, ``path``, ``content``,
            ``body_uuid`` and ``children`` (a list of child heading UUIDs).
        """
        structure: Dict = {}
        # Soft-deleted bodies are excluded in the JOIN condition (not the
        # WHERE clause) so that headings without a live body are still returned.
        self.db_manager.cursor.execute('''
            SELECT h.uuid, h.level, h.title, h.parent_uuid, h.path, b.content, b.uuid
            FROM headings h
            LEFT JOIN body b ON h.uuid = b.heading_uuid AND b.isDeleted = 0
            WHERE h.document_id = ? AND h.isDeleted = 0
            ORDER BY h.level, h.id, b.id
        ''', (document_id,))
        for heading_uuid, level, title, parent_uuid, path, content, body_uuid in self.db_manager.cursor.fetchall():
            structure[heading_uuid] = {
                'uuid': heading_uuid,
                'level': level,
                'title': title,
                'parent_uuid': parent_uuid,
                'path': path,
                'content': content,
                'body_uuid': body_uuid,
                'children': [],
            }
        # Link each heading to its parent to form the tree.
        for heading_uuid, node in structure.items():
            if node['parent_uuid'] in structure:
                structure[node['parent_uuid']]['children'].append(heading_uuid)
        return structure

    def parse_new_structure(self, tokens: List, document_id: int, existing_structure: Dict) -> Dict:
        """Build the heading tree described by a markdown-it token stream.

        Args:
            tokens: Objects exposing ``type``, ``tag`` and ``content``.
            document_id: Unused here; kept for interface compatibility.
            existing_structure: Result of ``get_existing_document_structure``;
                used to reuse the UUID of headings that already exist.

        Returns:
            A dict keyed by heading UUID, in document order. Each value holds
            ``uuid``, ``level``, ``title``, ``parent_uuid``, ``path``
            (slash-joined UUIDs from the top-level ancestor down),
            ``content`` and ``children`` (a list of child UUIDs). Text that
            precedes the first heading is not recorded.
        """
        structure: Dict = {}
        current_heading: Optional[str] = None
        current_content: List[str] = []
        in_heading = False  # True between heading_open and heading_close
        claimed = set()     # existing UUIDs already assigned to a parsed heading
        parent_stack = [{"uuid": None, "level": 0, "path": ""}]

        for token in tokens:
            if token.type == 'heading_open':
                if current_heading:
                    structure[current_heading]['content'] = ''.join(current_content).strip()
                level = int(token.tag.strip('h'))
                # The level-0 sentinel is never popped since levels start at 1.
                while parent_stack[-1]['level'] >= level:
                    parent_stack.pop()
                parent = parent_stack[-1]
                # Temporary UUID; swapped for a stored one at heading_close
                # if this heading turns out to exist already.
                current_heading = str(uuid.uuid4())
                structure[current_heading] = {
                    'uuid': current_heading,
                    'level': level,
                    'title': '',
                    'parent_uuid': parent['uuid'],
                    'path': f"{parent['path']}/{current_heading}" if parent['path'] else current_heading,
                    'content': '',
                    'children': [],
                }
                if parent['uuid'] is not None:
                    structure[parent['uuid']]['children'].append(current_heading)
                parent_stack.append({
                    "uuid": current_heading,
                    "level": level,
                    "path": structure[current_heading]['path'],
                })
                current_content = []
                in_heading = True
            elif token.type == 'heading_close':
                in_heading = False
                if current_heading is not None:
                    # The title is final now, so the lookup can be done once.
                    current_heading = self._adopt_existing_uuid(
                        structure, current_heading, existing_structure, claimed, parent_stack)
            elif current_heading:
                if in_heading and token.type == 'inline':
                    structure[current_heading]['title'] = token.content
                else:
                    current_content.append(token.content)

        if current_heading:
            structure[current_heading]['content'] = ''.join(current_content).strip()
        return structure

    def _adopt_existing_uuid(self, structure: Dict, temp_uuid: str, existing_structure: Dict,
                             claimed: set, parent_stack: List) -> str:
        """Swap a heading's temporary UUID for the UUID of its stored counterpart.

        A stored heading matches when title, level and parent UUID are equal
        and it has not been claimed by an earlier heading of this parse.
        On a match the node, its parent's ``children`` list and the top of
        ``parent_stack`` are all rewritten so later children link to the
        stored UUID.

        Returns:
            The UUID the heading is now stored under in ``structure``.
        """
        node = structure[temp_uuid]
        match = next(
            (known for known, old in existing_structure.items()
             if known not in claimed
             and old['title'] == node['title']
             and old['level'] == node['level']
             and old['parent_uuid'] == node['parent_uuid']),
            None)
        if match is None:
            return temp_uuid

        claimed.add(match)
        # temp_uuid is the most recently inserted key, so pop + reinsert
        # keeps the dict in document order.
        structure[match] = structure.pop(temp_uuid)
        node['uuid'] = match
        parent_path = parent_stack[-2]['path']
        node['path'] = f"{parent_path}/{match}" if parent_path else match
        if node['parent_uuid'] is not None:
            # The temporary UUID was appended last at heading_open.
            structure[node['parent_uuid']]['children'][-1] = match
        parent_stack[-1]['uuid'] = match
        parent_stack[-1]['path'] = node['path']
        return match

    def merge_structures(self, existing: Dict, new: Dict, document_id: int) -> None:
        """Persist the difference between the stored and the parsed tree.

        Headings whose UUID is in both dicts are updated, headings only in
        ``new`` are inserted under their parsed UUID, and headings only in
        ``existing`` are soft deleted. Body text is written only when the
        parsed content is non-empty. All changes are committed at the end.

        Args:
            existing: Result of ``get_existing_document_structure``.
            new: Result of ``parse_new_structure`` (keys are heading UUIDs,
                parents appear before their children).
            document_id: Document the headings belong to.
        """
        logging.info(f"Starting merge_structures for document_id: {document_id}")

        for heading_uuid, node in new.items():
            logging.debug(f"Processing node: {node['title']}")
            known = existing.get(heading_uuid)
            if known is None:
                logging.info(f"Inserting new heading: {node['title']}")
                self.insert_heading(node['level'], node['title'], node['parent_uuid'],
                                    document_id, node['path'], heading_uuid=heading_uuid)
                body_uuid = None
            else:
                logging.debug(f"Updating existing node: {known['title']}")
                self.update_heading(heading_uuid, node['title'], node['level'],
                                    node['parent_uuid'], node['path'])
                body_uuid = known.get('body_uuid')

            # NOTE(review): when a heading's text was removed from the document
            # the stored body is left untouched, as before -- confirm intent.
            if node['content']:
                if body_uuid:
                    logging.debug(f"Updating body content for heading: {node['title']}")
                    self.update_body(body_uuid, node['content'], document_id)
                else:
                    logging.debug(f"Inserting body content for heading: {node['title']}")
                    self.insert_body(node['content'], heading_uuid, document_id)

        for heading_uuid, node in existing.items():
            if heading_uuid not in new:
                logging.info(f"Soft deleting heading: {node['title']}")
                self.soft_delete_heading(heading_uuid)

        # insert_body commits on its own; commit here too so that runs which
        # only update or delete are persisted as well.
        self.db_manager.conn.commit()
        logging.info("Merge structures completed")

    def insert_heading(self, level: int, title: str, parent_uuid: Optional[str], document_id: int,
                       path: str, heading_uuid: Optional[str] = None) -> str:
        """Insert a heading row and return its UUID.

        Args:
            heading_uuid: UUID to store the heading under. A fresh one is
                generated when omitted.
        """
        if heading_uuid is None:
            heading_uuid = str(uuid.uuid4())
        self.db_manager.cursor.execute('''
            INSERT INTO headings (uuid, level, title, parent_uuid, document_id, path)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (heading_uuid, level, title, parent_uuid, document_id, path))
        return heading_uuid

    def update_heading(self, heading_uuid: str, title: str, level: int, parent_uuid: Optional[str], path: str) -> None:
        """Overwrite title, level, parent and path of a heading and stamp the update time."""
        self.db_manager.cursor.execute('''
            UPDATE headings
            SET title = ?, level = ?, parent_uuid = ?, path = ?, updated_timestamp = CURRENT_TIMESTAMP
            WHERE uuid = ?
        ''', (title, level, parent_uuid, path, heading_uuid))

    def insert_body(self, content: str, heading_uuid: str, document_id: int) -> str:
        """Insert body text for a heading, commit, and return the new body UUID.

        Raises:
            ValueError: If an argument is empty, or the heading or document
                does not exist.
            sqlite3.Error: If a statement fails; the transaction is rolled
                back before the error propagates.
        """
        # Validate before touching ``content`` so a missing value surfaces as
        # the documented ValueError rather than an AttributeError.
        if not all([content, heading_uuid, document_id]):
            raise ValueError("Missing required parameters for insert_body")

        body_uuid = str(uuid.uuid4())
        md5sum = hashlib.md5(content.encode()).hexdigest()
        logging.debug(f"Inserting body {body_uuid} (md5sum {md5sum}) for heading {heading_uuid}")
        cursor = self.db_manager.cursor
        try:
            cursor.execute("SELECT 1 FROM headings WHERE uuid = ?", (heading_uuid,))
            if not cursor.fetchone():
                raise ValueError(f"heading_uuid {heading_uuid} does not exist in headings table")
            cursor.execute("SELECT 1 FROM documents WHERE id = ?", (document_id,))
            if not cursor.fetchone():
                raise ValueError(f"document_id {document_id} does not exist in documents table")
            cursor.execute('''
                INSERT INTO body (uuid, content, heading_uuid, document_id, md5sum)
                VALUES (?, ?, ?, ?, ?)
            ''', (body_uuid, content, heading_uuid, document_id, md5sum))
            self.db_manager.conn.commit()
        except sqlite3.Error:
            logging.exception("An error occurred while inserting body")
            self.db_manager.conn.rollback()
            raise
        return body_uuid

    def update_body(self, body_uuid: str, content: str, document_id: int) -> None:
        """Replace the text and checksum of a body row and stamp the update time."""
        md5sum = hashlib.md5(content.encode()).hexdigest()
        self.db_manager.cursor.execute('''
            UPDATE body
            SET content = ?, md5sum = ?, updated_timestamp = CURRENT_TIMESTAMP
            WHERE uuid = ? AND document_id = ?
        ''', (content, md5sum, body_uuid, document_id))

    def soft_delete_heading(self, heading_uuid: str) -> None:
        """Mark a heading and its body rows as deleted.

        Child headings are not touched here; ``merge_structures`` calls this
        once for every heading that disappeared.
        """
        now = datetime.now().isoformat()
        self.db_manager.cursor.execute('''
            UPDATE headings
            SET isDeleted = 1, deleted_timestamp = ?
            WHERE uuid = ?
        ''', (now, heading_uuid))
        # Also soft delete associated body content
        self.db_manager.cursor.execute('''
            UPDATE body
            SET isDeleted = 1, deleted_timestamp = ?
            WHERE heading_uuid = ?
        ''', (now, heading_uuid))

View File

@ -43,29 +43,33 @@ class DatabaseManager:
CREATE TABLE IF NOT EXISTS headings ( CREATE TABLE IF NOT EXISTS headings (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT NOT NULL UNIQUE,
level INTEGER NOT NULL, level INTEGER NOT NULL,
title TEXT NOT NULL, title TEXT NOT NULL,
parent_id INTEGER, parent_uuid TEXT,
document_id INTEGER NOT NULL, document_id INTEGER NOT NULL,
path TEXT NOT NULL,
headings_order INTEGER,
added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_timestamp DATETIME, updated_timestamp DATETIME,
deleted_timestamp DATETIME, deleted_timestamp DATETIME,
isDeleted BOOLEAN DEFAULT 0, isDeleted BOOLEAN DEFAULT 0,
FOREIGN KEY (parent_id) REFERENCES headings(id), FOREIGN KEY (parent_uuid) REFERENCES headings(uuid),
FOREIGN KEY (document_id) REFERENCES documents(id) FOREIGN KEY (document_id) REFERENCES documents(id)
); );
CREATE TABLE IF NOT EXISTS body ( CREATE TABLE IF NOT EXISTS body (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT NOT NULL UNIQUE,
content TEXT, content TEXT,
heading_id INTEGER NOT NULL, heading_uuid TEXT NOT NULL,
document_id INTEGER NOT NULL, document_id INTEGER NOT NULL,
md5sum TEXT, md5sum TEXT,
added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_timestamp DATETIME, updated_timestamp DATETIME,
deleted_timestamp DATETIME, deleted_timestamp DATETIME,
isDeleted BOOLEAN DEFAULT 0, isDeleted BOOLEAN DEFAULT 0,
FOREIGN KEY (heading_id) REFERENCES headings(id), FOREIGN KEY (heading_uuid) REFERENCES headings(uuid),
FOREIGN KEY (document_id) REFERENCES documents(id) FOREIGN KEY (document_id) REFERENCES documents(id)
); );
''') ''')
@ -83,12 +87,6 @@ class DocumentManager:
def document_exists(self, document_name: str) -> Optional[Tuple[int]]: def document_exists(self, document_name: str) -> Optional[Tuple[int]]:
""" """
Check if a document exists in the database. Check if a document exists in the database.
Args:
document_name: Name of the document to check.
Returns:
Document ID if it exists, None otherwise.
""" """
self.db_manager.cursor.execute('SELECT id FROM documents WHERE name = ?', (document_name,)) self.db_manager.cursor.execute('SELECT id FROM documents WHERE name = ?', (document_name,))
return self.db_manager.cursor.fetchone() return self.db_manager.cursor.fetchone()
@ -119,7 +117,7 @@ class DocumentManager:
def soft_delete_document(self, document_id: int) -> None: def soft_delete_document(self, document_id: int) -> None:
"""Soft delete a document by marking it as deleted in the database.""" """Soft delete a document by marking it as deleted in the database."""
logging.debug(f"** This now soft deleted, document_id: {document_id}") logging.debug(f"** This document is now soft deleted, document_id: {document_id}")
now: str = datetime.now().isoformat() now: str = datetime.now().isoformat()
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE documents SET isDeleted = 1, deleted_timestamp = ? WHERE id = ? UPDATE documents SET isDeleted = 1, deleted_timestamp = ? WHERE id = ?
@ -146,6 +144,7 @@ class MarkdownProcessor:
md = MarkdownIt() md = MarkdownIt()
tokens = md.parse(markdown_text) tokens = md.parse(markdown_text)
print('### Calling update_document_content')
self.update_document_content(tokens, document_id) self.update_document_content(tokens, document_id)
def read_markdown_file(self, file_path: str) -> str: def read_markdown_file(self, file_path: str) -> str:
@ -153,40 +152,44 @@ class MarkdownProcessor:
return file.read() return file.read()
def update_document_content(self, tokens: List, document_id: int) -> None: def update_document_content(self, tokens: List, document_id: int) -> None:
existing_structure = {}
existing_structure = self.get_existing_document_structure(document_id) existing_structure = self.get_existing_document_structure(document_id)
new_structure = self.parse_new_structure(tokens) new_structure = self.parse_new_structure(tokens, document_id, existing_structure)
print('### Calling merge_structures...')
self.merge_structures(existing_structure, new_structure, document_id) self.merge_structures(existing_structure, new_structure, document_id)
def get_existing_document_structure(self, document_id: int) -> Dict: def get_existing_document_structure(self, document_id: int) -> Dict:
structure = {} structure = {}
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT h.id, h.level, h.title, h.parent_id, b.content SELECT h.uuid, h.level, h.title, h.parent_uuid, h.path, b.content, b.uuid
FROM headings h FROM headings h
LEFT JOIN body b ON h.id = b.heading_id LEFT JOIN body b ON h.uuid = b.heading_uuid
WHERE h.document_id = ? AND h.isDeleted = 0 WHERE h.document_id = ? AND h.isDeleted = 0
ORDER BY h.level, h.id ORDER BY h.level, h.id
''', (document_id,)) ''', (document_id,))
for heading_id, level, title, parent_id, content in self.db_manager.cursor.fetchall(): for heading_uuid, level, title, parent_uuid, path, content, body_uuid in self.db_manager.cursor.fetchall():
structure[heading_id] = { structure[heading_uuid] = {
'id': heading_id, # Add this line to include the id in each node 'uuid': heading_uuid,
'level': level, 'level': level,
'title': title, 'title': title,
'parent_id': parent_id, 'parent_uuid': parent_uuid,
'path': path,
'content': content, 'content': content,
'body_uuid': body_uuid,
'children': [] 'children': []
} }
# Build the tree structure # Build the tree structure
for id, node in structure.items(): for uuid, node in structure.items():
if node['parent_id'] in structure: if node['parent_uuid'] in structure:
structure[node['parent_id']]['children'].append(id) structure[node['parent_uuid']]['children'].append(uuid)
return structure return structure
def parse_new_structure(self, tokens: List) -> Dict: def parse_new_structure(self, tokens: List, document_id: int, existing_structure: Dict) -> Dict:
structure = {} structure = {}
current_heading = None current_heading = None
current_content = [] current_content = []
parent_stack = [{"id": None, "level": 0}] parent_stack = [{"uuid": None, "level": 0, "path": ""}]
for token in tokens: for token in tokens:
if token.type == 'heading_open': if token.type == 'heading_open':
@ -195,21 +198,41 @@ class MarkdownProcessor:
level = int(token.tag.strip('h')) level = int(token.tag.strip('h'))
while parent_stack[-1]['level'] >= level: while parent_stack[-1]['level'] >= level:
parent_stack.pop() parent_stack.pop()
current_heading = str(uuid.uuid4()) # Generate a temporary ID
parent_path = parent_stack[-1]['path']
current_heading = str(uuid.uuid4()) # Always assign a new UUID here, may change later
structure[current_heading] = { structure[current_heading] = {
'uuid': current_heading,
'level': level, 'level': level,
'title': '', 'title': '',
'parent_id': parent_stack[-1]['id'], 'parent_uuid': parent_stack[-1]['uuid'],
'path': f"{parent_path}/{current_heading}" if parent_path else current_heading,
'content': '', 'content': '',
'children': [] 'children': []
} }
parent_stack.append({"id": current_heading, "level": level}) parent_stack.append({"uuid": current_heading, "level": level, "path": structure[current_heading]['path']})
current_content = [] current_content = []
elif token.type == 'heading_close': elif token.type == 'heading_close':
structure[current_heading]['content'] = ''.join(current_content).strip() structure[current_heading]['content'] = ''.join(current_content).strip()
elif token.type == 'inline' and current_heading: elif token.type == 'inline' and current_heading:
if structure[current_heading]['title'] == '': if structure[current_heading]['title'] == '':
# Populate the title
structure[current_heading]['title'] = token.content structure[current_heading]['title'] = token.content
# Now check for existing UUID based on title, level, and parent
existing_uuid = next(
(uuid for uuid, node in existing_structure.items()
if node['title'] == structure[current_heading]['title']
and node['level'] == structure[current_heading]['level']
and node['parent_uuid'] == structure[current_heading]['parent_uuid']), None)
if existing_uuid:
# If found in existing structure, replace the new UUID
structure[existing_uuid] = structure.pop(current_heading)
current_heading = existing_uuid
else: else:
current_content.append(token.content) current_content.append(token.content)
elif current_heading: elif current_heading:
@ -221,17 +244,45 @@ class MarkdownProcessor:
return structure return structure
def merge_structures(self, existing: Dict, new: Dict, document_id: int) -> None: def merge_structures(self, existing: Dict, new: Dict, document_id: int) -> None:
def merge_recursive(existing_node, new_node, parent_id): logging.info(f"Starting merge_structures for document_id: {document_id}")
def merge_recursive(existing_node, new_node, parent_uuid):
logging.debug(f"Processing node: {new_node['title']}")
if not existing_node: if not existing_node:
# This is a new node, insert it logging.debug(f"No existing node found for {new_node['title']}")
heading_id = self.insert_heading(new_node['level'], new_node['title'], parent_id, document_id) # Check if a heading with the same title already exists at this level
self.insert_body(new_node['content'], heading_id, document_id) self.db_manager.cursor.execute('''
for child in new_node['children']: SELECT uuid FROM headings
merge_recursive(None, new[child], heading_id) WHERE title = ? AND level = ? AND parent_uuid = ? AND document_id = ? AND isDeleted = 0
''', (new_node['title'], new_node['level'], parent_uuid, document_id))
existing_uuid = self.db_manager.cursor.fetchone()
if existing_uuid:
heading_uuid = existing_uuid[0]
logging.info(f"Updating existing heading: {new_node['title']} (UUID: {heading_uuid})")
self.update_heading(heading_uuid, new_node['title'], new_node['level'], parent_uuid, new_node['path'])
else: else:
logging.info(f"Inserting new heading: {new_node['title']}")
heading_uuid = self.insert_heading(new_node['level'], new_node['title'], parent_uuid, document_id, new_node['path'])
if new_node['content']:
logging.debug(f"Inserting body content for heading: {new_node['title']}")
body_uuid = self.insert_body(new_node['content'], heading_uuid, document_id)
for child in new_node['children']:
merge_recursive(None, new[child], heading_uuid)
else:
logging.debug(f"Updating existing node: {existing_node['title']}")
# Update existing node # Update existing node
self.update_heading(existing_node['id'], new_node['title'], new_node['level'], parent_id) self.update_heading(existing_node['uuid'], new_node['title'], new_node['level'], parent_uuid, new_node['path'])
self.update_body(existing_node['id'], new_node['content'], document_id) if new_node['content']:
if existing_node['body_uuid']:
logging.debug(f"Updating body content for heading: {existing_node['title']}")
self.update_body(existing_node['body_uuid'], new_node['content'], document_id)
else:
logging.debug(f"Inserting new body content for existing heading: {existing_node['title']}")
self.insert_body(new_node['content'], existing_node['uuid'], document_id)
# Process children # Process children
existing_children = {child['title']: child for child in existing_node['children']} existing_children = {child['title']: child for child in existing_node['children']}
@ -239,89 +290,125 @@ class MarkdownProcessor:
for title, child in new_children.items(): for title, child in new_children.items():
if title in existing_children: if title in existing_children:
merge_recursive(existing_children[title], child, existing_node['id']) merge_recursive(existing_children[title], child, existing_node['uuid'])
else: else:
merge_recursive(None, child, existing_node['id']) merge_recursive(None, child, existing_node['uuid'])
for title, child in existing_children.items(): for title, child in existing_children.items():
if title not in new_children: if title not in new_children:
self.soft_delete_heading(child['id']) logging.info(f"Soft deleting heading: {child['title']}")
self.soft_delete_heading(child['uuid'])
for new_root in new.values(): for new_root in new.values():
existing_root = next((node for node in existing.values() if node['title'] == new_root['title']), None) logging.info(f"Processing root node: {new_root['title']}")
existing_root = next((node for node in existing.values() if node['path'] == new_root['path']), None)
merge_recursive(existing_root, new_root, None) merge_recursive(existing_root, new_root, None)
def insert_heading(self, level: int, title: str, parent_id: Optional[int], document_id: int) -> int: logging.info("Merge structures completed")
self.db_manager.cursor.execute('''
INSERT INTO headings (level, title, parent_id, document_id)
VALUES (?, ?, ?, ?)
''', (level, title, parent_id, document_id))
return self.db_manager.cursor.lastrowid
def update_heading(self, heading_id: int, title: str, level: int, parent_id: Optional[int]) -> None: def insert_heading(self, level: int, title: str, parent_uuid: Optional[str], document_id: int, path: str) -> str:
heading_uuid = str(uuid.uuid4())
self.db_manager.cursor.execute('''
INSERT INTO headings (uuid, level, title, parent_uuid, document_id, path)
VALUES (?, ?, ?, ?, ?, ?)
''', (heading_uuid, level, title, parent_uuid, document_id, path))
return heading_uuid
def update_heading(self, heading_uuid: str, title: str, level: int, parent_uuid: Optional[str], path: str) -> None:
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE headings UPDATE headings
SET title = ?, level = ?, parent_id = ?, updated_timestamp = CURRENT_TIMESTAMP SET title = ?, level = ?, parent_uuid = ?, path = ?, updated_timestamp = CURRENT_TIMESTAMP
WHERE id = ? WHERE uuid = ?
''', (title, level, parent_id, heading_id)) ''', (title, level, parent_uuid, path, heading_uuid))
def insert_body(self, content: str, heading_id: int, document_id: int) -> None: def insert_body(self, content: str, heading_uuid: str, document_id: int) -> str:
body_uuid = str(uuid.uuid4())
md5sum = hashlib.md5(content.encode()).hexdigest() md5sum = hashlib.md5(content.encode()).hexdigest()
self.db_manager.cursor.execute(''' print(f"###### Trying to insert body text with md5sum of: {md5sum} to uuid: {body_uuid}, with content: \n{content}\n")
INSERT INTO body (content, heading_id, document_id, md5sum)
VALUES (?, ?, ?, ?)
''', (content, heading_id, document_id, md5sum))
def update_body(self, heading_id: int, content: str, document_id: int) -> None: # Verify input parameters
if not all([content, heading_uuid, document_id]):
raise ValueError("Missing required parameters for insert_body")
try:
# Check if heading_uuid exists
self.db_manager.cursor.execute("SELECT 1 FROM headings WHERE uuid = ?", (heading_uuid,))
if not self.db_manager.cursor.fetchone():
raise ValueError(f"heading_uuid {heading_uuid} does not exist in headings table")
# Check if document_id exists
self.db_manager.cursor.execute("SELECT 1 FROM documents WHERE id = ?", (document_id,))
if not self.db_manager.cursor.fetchone():
raise ValueError(f"document_id {document_id} does not exist in documents table")
# Insert the body
self.db_manager.cursor.execute('''
INSERT INTO body (uuid, content, heading_uuid, document_id, md5sum)
VALUES (?, ?, ?, ?, ?)
''', (body_uuid, content, heading_uuid, document_id, md5sum))
self.db_manager.conn.commit()
print(f"###### Successfully inserted body with uuid: {body_uuid}")
except sqlite3.Error as e:
print(f"An error occurred while inserting body: {e}")
self.db_manager.conn.rollback()
raise
except ValueError as e:
print(f"Validation error: {e}")
raise
return body_uuid
def update_body(self, body_uuid: str, content: str, document_id: int) -> None:
md5sum = hashlib.md5(content.encode()).hexdigest() md5sum = hashlib.md5(content.encode()).hexdigest()
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE body UPDATE body
SET content = ?, md5sum = ?, updated_timestamp = CURRENT_TIMESTAMP SET content = ?, md5sum = ?, updated_timestamp = CURRENT_TIMESTAMP
WHERE heading_id = ? AND document_id = ? WHERE uuid = ? AND document_id = ?
''', (content, md5sum, heading_id, document_id)) ''', (content, md5sum, body_uuid, document_id))
def soft_delete_heading(self, heading_id: int) -> None: def soft_delete_heading(self, heading_uuid: str) -> None:
now = datetime.now().isoformat() now = datetime.now().isoformat()
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE headings UPDATE headings
SET isDeleted = 1, deleted_timestamp = ? SET isDeleted = 1, deleted_timestamp = ?
WHERE id = ? WHERE uuid = ?
''', (now, heading_id)) ''', (now, heading_uuid))
# Also soft delete associated body content # Also soft delete associated body content
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE body UPDATE body
SET isDeleted = 1, deleted_timestamp = ? SET isDeleted = 1, deleted_timestamp = ?
WHERE heading_id = ? WHERE heading_uuid = ?
''', (now, heading_id)) ''', (now, heading_uuid))
class TopicReader: class TopicReader:
"""Reads and retrieves topics from the database.""" """Reads and retrieves topics from the database."""
def __init__(self, db_manager: 'DatabaseManager'): def __init__(self, db_manager: 'DatabaseManager'):
self.db_manager = db_manager self.db_manager = db_manager
def fetch_headings(self) -> List[Tuple[int, str, int, Optional[int]]]: def fetch_headings(self) -> List[Tuple[str, str, int, Optional[str]]]:
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT id, title, level, parent_id SELECT uuid, title, level, parent_uuid
FROM headings FROM headings
WHERE isDeleted = 0 WHERE isDeleted = 0
ORDER BY level, id ORDER BY level, headings_order
''') ''')
return self.db_manager.cursor.fetchall() return self.db_manager.cursor.fetchall()
def fetch_topic_chain(self, heading_id: int) -> List[Tuple[int, str, int]]: def fetch_topic_chain(self, heading_uuid: str) -> List[Tuple[str, str, int]]:
chain = [] chain = []
current_id = heading_id current_uuid = heading_uuid
while current_id is not None: while current_uuid is not None:
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT id, title, level, parent_id SELECT uuid, title, level, parent_uuid
FROM headings FROM headings
WHERE id = ? WHERE uuid = ?
''', (current_id,)) ''', (current_uuid,))
result = self.db_manager.cursor.fetchone() result = self.db_manager.cursor.fetchone()
if result: if result:
chain.append((result[0], result[1], result[2])) chain.append((result[0], result[1], result[2]))
current_id = result[3] current_uuid = result[3]
else: else:
break break
@ -331,33 +418,33 @@ class TopicReader:
headings = self.fetch_headings() headings = self.fetch_headings()
result = "Available headings:\n" result = "Available headings:\n"
def build_tree(parent_id, level): def build_tree(parent_uuid, level):
tree = "" tree = ""
for id, title, hlevel, parent in headings: for uuid, title, hlevel, parent in headings:
if parent == parent_id: if parent == parent_uuid:
indent = " " * (hlevel - 1) indent = " " * (hlevel - 1)
tree += f"{indent}- {title}\n" tree += f"{indent}- {title}\n"
tree += build_tree(id, hlevel + 1) tree += build_tree(uuid, hlevel + 1)
return tree return tree
result += build_tree(None, 1) result += build_tree(None, 1)
return result.strip() return result.strip()
def get_topic_content(self, input_title: str) -> Optional[str]: def get_topic_content(self, input_title: str) -> Optional[str]:
heading_id = self.find_closest_heading(input_title) heading_uuid = self.find_closest_heading(input_title)
if heading_id: if heading_uuid:
topic_chain = self.fetch_topic_chain(heading_id) topic_chain = self.fetch_topic_chain(heading_uuid)
result = self.build_full_content(topic_chain[-1][0]) result = self.build_full_content(topic_chain[-1][0])
return result return result
return None return None
def build_full_content(self, heading_id: int, level_offset: int = 0) -> str: def build_full_content(self, heading_uuid: str, level_offset: int = 0) -> str:
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT h.level, h.title, b.content SELECT h.level, h.title, b.content
FROM headings h FROM headings h
LEFT JOIN body b ON h.id = b.heading_id LEFT JOIN body b ON h.uuid = b.heading_uuid
WHERE h.id = ? AND h.isDeleted = 0 WHERE h.uuid = ? AND h.isDeleted = 0
''', (heading_id,)) ''', (heading_uuid,))
heading = self.db_manager.cursor.fetchone() heading = self.db_manager.cursor.fetchone()
if not heading: if not heading:
return "" return ""
@ -370,17 +457,17 @@ class TopicReader:
# Fetch and process all child headings # Fetch and process all child headings
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT id FROM headings SELECT uuid FROM headings
WHERE parent_id = ? AND isDeleted = 0 WHERE parent_uuid = ? AND isDeleted = 0
ORDER BY level, id ORDER BY level, headings_order
''', (heading_id,)) ''', (heading_uuid,))
children = self.db_manager.cursor.fetchall() children = self.db_manager.cursor.fetchall()
for child in children: for child in children:
result += self.build_full_content(child[0], level_offset) result += self.build_full_content(child[0], level_offset)
return result return result
def find_closest_heading(self, input_title: str) -> Optional[int]: def find_closest_heading(self, input_title: str) -> Optional[str]:
headings = self.fetch_headings() headings = self.fetch_headings()
if not headings: if not headings:
print("No topics found in the database.") print("No topics found in the database.")
@ -393,9 +480,9 @@ class TopicReader:
print(f"No close matches found for '{input_title}' (Confidence: {confidence})") print(f"No close matches found for '{input_title}' (Confidence: {confidence})")
return None return None
for heading_id, title, _, _ in headings: for heading_uuid, title, _, _ in headings:
if title == closest_match: if title == closest_match:
return heading_id return heading_uuid
return None return None
@ -409,38 +496,54 @@ def compute_file_hash(file_path: str) -> str:
hash_md5.update(chunk) hash_md5.update(chunk)
return hash_md5.hexdigest() return hash_md5.hexdigest()
def generate_calendar(year: int) -> str: def generate_calendar(year: int, db_manager: 'DatabaseManager', document_id: int) -> str:
"""Generate a markdown calendar for the specified year.""" """
Generate a markdown calendar for the specified year.
"""
calendar_markdown = f"# {year}\n\n" calendar_markdown = f"# {year}\n\n"
current_date = datetime.now().date() current_date = datetime.now().date()
# Loop through the months
for month in range(1, 13): for month in range(1, 13):
month_name = datetime(year, month, 1).strftime('%B') month_name = datetime(year, month, 1).strftime('%B')
calendar_markdown += f"## {month:02d} / {month_name}\n\n" calendar_markdown += f"## {month:02d} / {month_name}\n\n"
# Calculate the number of days in the month # Determine the number of days in the month
num_days = (datetime(year, month + 1, 1) - datetime(year, month, 1)).days if month < 12 else (datetime(year + 1, 1, 1) - datetime(year, month, 1)).days if month == 12:
num_days = (datetime(year + 1, 1, 1) - datetime(year, month, 1)).days
else:
num_days = (datetime(year, month + 1, 1) - datetime(year, month, 1)).days
# Generate calendar entries for each day # Create calendar entries for each day in order
for day in range(1, num_days + 1): for day in range(1, num_days + 1):
day_date = datetime(year, month, day).date() day_date = datetime(year, month, day).date()
day_name = day_date.strftime('%a') day_name = day_date.strftime('%a')
# Check if this is the current day and make it bold if so # Add bold styling for the current date
if str(day_date) == str(current_date): if str(day_date) == str(current_date):
calendar_markdown += f"**{day:02d} / {day_name}**\n" calendar_markdown += f"**{day:02d} / {day_name}**\n"
else: else:
calendar_markdown += f"{day:02d} / {day_name}\n" calendar_markdown += f"{day:02d} / {day_name}\n"
calendar_markdown += "\n" # Add a newline after each month calendar_markdown += '\n'
# Now parse the markdown and insert into the database
#parse_and_insert_markdown(calendar_markdown, db_manager, document_id, year)
return calendar_markdown return calendar_markdown
def convert_to_html(markdown_content: str) -> str: def convert_to_html(markdown_content: str, heading_uuid: Optional[str] = None) -> str:
""" """
Convert Markdown content to HTML. Convert Markdown content (or specific section) to HTML.
""" """
md = MarkdownIt() md = MarkdownIt()
if heading_uuid:
# Fetch content for a specific heading and its sub-headings from the database
# Example SQL to get heading content based on UUID:
# SELECT title, content FROM headings WHERE uuid = ?
pass
html_content = md.render(markdown_content) html_content = md.render(markdown_content)
# Wrap the content in a basic HTML structure # Wrap the content in a basic HTML structure
@ -450,7 +553,7 @@ def convert_to_html(markdown_content: str) -> str:
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Calendar</title> <title>Document</title>
<style> <style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }} body {{ font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }}
h1, h2, h3, h4, h5, h6 {{ margin-top: 24px; margin-bottom: 16px; }} h1, h2, h3, h4, h5, h6 {{ margin-top: 24px; margin-bottom: 16px; }}
@ -467,65 +570,94 @@ def convert_to_html(markdown_content: str) -> str:
return html_document return html_document
def bootstrap_calendar(year: int, db_manager: DatabaseManager, markdown_file: str): def bootstrap_calendar(year: int, db_manager: DatabaseManager, markdown_file: str):
"""Generate and store a full year's markdown calendar in the database.""" """Generate and store a full year's markdown calendar in the database using UUIDs."""
calendar_content = generate_calendar(year) document_manager = DocumentManager(db_manager)
markdown_processor = MarkdownProcessor(db_manager)
# Write the calendar to the specified markdown file print('## Generating calendar')
document_id = document_manager.create_document(f"{year} Calendar", markdown_file)
calendar_content = generate_calendar(year, db_manager, document_id=document_id)
print('## Writing the newly created calendar')
with open(markdown_file, 'w', encoding='utf-8') as f: with open(markdown_file, 'w', encoding='utf-8') as f:
f.write(calendar_content) f.write(calendar_content)
# Now use the DocumentManager and MarkdownProcessor to read this file into the database # Process the markdown to update or store in the database
document_manager = DocumentManager(db_manager)
document_id = document_manager.create_document(os.path.basename(markdown_file), markdown_file)
markdown_processor = MarkdownProcessor(db_manager)
markdown_processor.process_markdown(markdown_file, document_id) markdown_processor.process_markdown(markdown_file, document_id)
# Calculate and store the hash
current_file_hash = compute_file_hash(markdown_file)
document_manager.update_document_hash(document_id, current_file_hash)
print(f"Calendar for year {year} has been generated and stored in the database.") print(f"Calendar for year {year} has been generated and stored in the database.")
def main(): def main():
""" """
This script processes a markdown file, updates an SQLite database, This script processes a markdown file, updates an SQLite database,
and optionally selects a topic based on user input. and optionally selects a topic based on user input.
Initializes managers for database and markdown handling. Updates documents based on
MD5 hash changes, and if a topic is provided, retrieves and writes its content
to the markdown file.
Args:
-m, --markdown: Path to markdown file (default: 'calendar.md').
-d, --database: Path to SQLite database file (default: 'markdown.db').
topic_title: Optional topic for content selection (fuzzy matching enabled).
--bootstrap: If provided, generates a markdown calendar for the current year and loads it into the database.
--ls: If provided, lists all available headings.
--html: If provided, writes a {filename}.html file alongside the markdown file.
""" """
# Set up command-line argument parsing
parser = argparse.ArgumentParser(description='Process markdown file and optionally select a topic.') parser = argparse.ArgumentParser(description='Process markdown file and optionally select a topic.')
parser.add_argument('-m', '--markdown', type=str, default='calendar.md', help='Input/output markdown file (default: calendar.md)') parser.add_argument('-m', '--markdown', type=str, default='calendar.md', help='Input/output markdown file (default: calendar.md)')
parser.add_argument('-d', '--database', type=str, default='markdown.db', help='SQLite database file (default: markdown.db)') parser.add_argument('-d', '--database', type=str, default='markdown.db', help='SQLite database file (default: markdown.db)')
parser.add_argument('topic_title', nargs='?', type=str, help='Topic title to select (fuzzy matching enabled)') parser.add_argument('topic_title', nargs='?', type=str, help='Topic title to select (fuzzy matching enabled)')
parser.add_argument('--bootstrap', action='store_true', help='Generate markdown calendar for the current year and load it to the database.') parser.add_argument('--bootstrap', action='store_true', help='Generate markdown calendar for the current year and load it to the database.')
parser.add_argument('--ls', action='store_true', help='List all available headings.') parser.add_argument('--ls', action='store_true', help='List all available headings.')
parser.add_argument('--html', action='store_true', help='Generate an HTML version of the output') parser.add_argument('--html', action='store_true', help='Generate an HTML version of the output.')
parser.add_argument('--uuid', type=str, help='Specify a UUID to retrieve content.')
parser.add_argument('--debug', action='store_true', help='Enable debug printing') parser.add_argument('--debug', action='store_true', help='Enable debug printing')
args = parser.parse_args() args = parser.parse_args()
# Set up logging # Setup basic logging
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
# Check for markdown file presence
# Use the provided or default file paths if not os.path.exists(args.markdown) and not args.bootstrap:
markdown_file = args.markdown print(f"Error: Markdown file '{args.markdown}' not found. Use --bootstrap to create a new calendar.")
database_file = args.database
# Initialize manager objects for database operations
db_manager = DatabaseManager(database_file)
if args.bootstrap:
bootstrap_calendar(datetime.now().year, db_manager, markdown_file)
db_manager.close() db_manager.close()
return return
# Check for database file presence
if not os.path.exists(args.database) and not args.bootstrap:
print(f"Error: Database file '{args.database}' not found. Use --bootstrap to create a new calendar.")
db_manager.close()
return
# Initialize manager objects
db_manager = DatabaseManager(args.database)
# Initialize the MarkdownProcessor with the db_manager
markdown_processor = MarkdownProcessor(db_manager)
if args.bootstrap:
print('## Running calendar bootstrap')
bootstrap_calendar(datetime.now().year, db_manager, args.markdown)
db_manager.close()
return
document_manager = DocumentManager(db_manager)
# Get the base name of the markdown file (without path)
document_name = os.path.basename(args.markdown)
# Check if file with same name has been uploaded before
document_id = document_manager.document_exists(document_name)
# Compute the current MD5 hash of the markdown file
current_file_hash = compute_file_hash(args.markdown)
if document_id:
# Check if current_file_hash is already in database
db_manager.cursor.execute("SELECT md5sum FROM documents WHERE id = ?", (document_id[0],))
result = db_manager.cursor.fetchone()
if result is None or result[0] != current_file_hash:
print("File has changed or hash not found. Processing...")
markdown_processor.process_markdown(args.markdown, document_id[0])
document_manager.update_document_hash(document_id[0], current_file_hash)
else:
print("No changes detected in the file.")
else:
print("Document does not exist, creating new entry.")
new_document_id = document_manager.create_document(document_name, args.markdown)
markdown_processor.process_markdown(args.markdown, new_document_id)
document_manager.update_document_hash(new_document_id, current_file_hash)
if args.ls: if args.ls:
topic_reader = TopicReader(db_manager) topic_reader = TopicReader(db_manager)
@ -533,90 +665,29 @@ def main():
db_manager.close() db_manager.close()
return return
# Check if the markdown file exists # Topic or UUID-based content retrieval
if not os.path.exists(markdown_file): if args.topic_title or args.uuid:
print(f"Error: Markdown file '{markdown_file}' not found. Use --bootstrap to create a new calendar.")
db_manager.close()
return
document_manager = DocumentManager(db_manager)
markdown_processor = MarkdownProcessor(db_manager)
# Get the base name of the markdown file (without path)
document_name = os.path.basename(markdown_file)
# Check if the document already exists in the database
document = db_manager.cursor.execute('SELECT id, file_path, md5sum, updated_timestamp FROM documents WHERE name = ?', (document_name,)).fetchone()
# Compute the current MD5 hash of the markdown file
current_file_hash = compute_file_hash(markdown_file)
if document:
# If the document exists in the database
document_id, stored_file_path, stored_md5sum, last_updated = document
if stored_file_path != markdown_file:
print(f"Updating file path for '{document_name}' in the database...")
document_manager.update_document(document_id, file_path=markdown_file)
if stored_md5sum != current_file_hash:
# If the file has changed since last update
print(f"File '{document_name}' has changed. Updating the database...")
document_manager.update_document_hash(document_id, current_file_hash)
markdown_processor.process_markdown(markdown_file, document_id)
else:
# If the file hasn't changed
print(f"File '{document_name}' has not changed. Skipping update.")
else:
# If the document doesn't exist in the database
print(f"Document '{document_name}' not found in the database. Adding new entry...")
document_id = document_manager.create_document(document_name, markdown_file)
document_manager.update_document_hash(document_id, current_file_hash)
markdown_processor.process_markdown(markdown_file, document_id)
# Check if a topic title argument is provided
if args.topic_title:
# Initialize TopicReader
topic_reader = TopicReader(db_manager) topic_reader = TopicReader(db_manager)
# Retrieve the content for the specified topic if args.uuid:
result = topic_reader.get_topic_content(args.topic_title) content = topic_reader.build_full_content(args.uuid)
else:
content = topic_reader.get_topic_content(args.topic_title)
if result: if content:
# If content is found, write it back to the original markdown file # Write the selected content to the markdown file
with open(markdown_file, 'w', encoding='utf-8') as file: with open(args.markdown, 'w', encoding='utf-8') as file:
file.write(result) file.write(content)
file.write('\n') file.write('\n')
print(f"Selected topic and subtopics written to {markdown_file}") print(f"Selected content written to {args.markdown}")
# Generate HTML if --html option is specified # Optionally convert to HTML
if args.html: if args.html:
html_file = os.path.splitext(markdown_file)[0] + '.html' html_file = f"{args.markdown}.html"
html_content = convert_to_html(result)
with open(html_file, 'w', encoding='utf-8') as file: with open(html_file, 'w', encoding='utf-8') as file:
file.write(html_content) file.write(convert_to_html(content))
print(f"HTML version written to {html_file}") print(f"HTML version written to {html_file}")
# Update the document hash in the database
new_file_hash = compute_file_hash(markdown_file)
document_manager.update_document_hash(document_id, new_file_hash)
else:
# If no content is found
print("No result to write. The original file remains unchanged.")
else:
print("No topic title provided. The database has been updated/added without modifying the file.")
# Generate HTML for the entire document if --html option is specified
if args.html:
with open(markdown_file, 'r', encoding='utf-8') as file:
markdown_content = file.read()
html_file = os.path.splitext(markdown_file)[0] + '.html'
html_content = convert_to_html(markdown_content)
with open(html_file, 'w', encoding='utf-8') as file:
file.write(html_content)
print(f"HTML version of the entire document written to {html_file}")
# Close the database connection
db_manager.close() db_manager.close()
if __name__ == '__main__': if __name__ == '__main__':