Trying to make the data handling more robust by adding UUID identifiers to headings and body rows.

This commit is contained in:
kalzu rekku 2024-10-05 14:56:27 +03:00
parent 3d35990b3e
commit 4dfc81bd44

View File

@ -43,29 +43,33 @@ class DatabaseManager:
CREATE TABLE IF NOT EXISTS headings ( CREATE TABLE IF NOT EXISTS headings (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT NOT NULL UNIQUE,
level INTEGER NOT NULL, level INTEGER NOT NULL,
title TEXT NOT NULL, title TEXT NOT NULL,
parent_id INTEGER, parent_uuid TEXT,
document_id INTEGER NOT NULL, document_id INTEGER NOT NULL,
path TEXT NOT NULL,
headings_order INTEGER,
added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_timestamp DATETIME, updated_timestamp DATETIME,
deleted_timestamp DATETIME, deleted_timestamp DATETIME,
isDeleted BOOLEAN DEFAULT 0, isDeleted BOOLEAN DEFAULT 0,
FOREIGN KEY (parent_id) REFERENCES headings(id), FOREIGN KEY (parent_uuid) REFERENCES headings(uuid),
FOREIGN KEY (document_id) REFERENCES documents(id) FOREIGN KEY (document_id) REFERENCES documents(id)
); );
CREATE TABLE IF NOT EXISTS body ( CREATE TABLE IF NOT EXISTS body (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT NOT NULL UNIQUE,
content TEXT, content TEXT,
heading_id INTEGER NOT NULL, heading_uuid TEXT NOT NULL,
document_id INTEGER NOT NULL, document_id INTEGER NOT NULL,
md5sum TEXT, md5sum TEXT,
added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, added_timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_timestamp DATETIME, updated_timestamp DATETIME,
deleted_timestamp DATETIME, deleted_timestamp DATETIME,
isDeleted BOOLEAN DEFAULT 0, isDeleted BOOLEAN DEFAULT 0,
FOREIGN KEY (heading_id) REFERENCES headings(id), FOREIGN KEY (heading_uuid) REFERENCES headings(uuid),
FOREIGN KEY (document_id) REFERENCES documents(id) FOREIGN KEY (document_id) REFERENCES documents(id)
); );
''') ''')
@ -83,12 +87,6 @@ class DocumentManager:
def document_exists(self, document_name: str) -> Optional[Tuple[int]]: def document_exists(self, document_name: str) -> Optional[Tuple[int]]:
""" """
Check if a document exists in the database. Check if a document exists in the database.
Args:
document_name: Name of the document to check.
Returns:
Document ID if it exists, None otherwise.
""" """
self.db_manager.cursor.execute('SELECT id FROM documents WHERE name = ?', (document_name,)) self.db_manager.cursor.execute('SELECT id FROM documents WHERE name = ?', (document_name,))
return self.db_manager.cursor.fetchone() return self.db_manager.cursor.fetchone()
@ -119,7 +117,7 @@ class DocumentManager:
def soft_delete_document(self, document_id: int) -> None: def soft_delete_document(self, document_id: int) -> None:
"""Soft delete a document by marking it as deleted in the database.""" """Soft delete a document by marking it as deleted in the database."""
logging.debug(f"** This now soft deleted, document_id: {document_id}") logging.debug(f"** This document is now soft deleted, document_id: {document_id}")
now: str = datetime.now().isoformat() now: str = datetime.now().isoformat()
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE documents SET isDeleted = 1, deleted_timestamp = ? WHERE id = ? UPDATE documents SET isDeleted = 1, deleted_timestamp = ? WHERE id = ?
@ -146,6 +144,7 @@ class MarkdownProcessor:
md = MarkdownIt() md = MarkdownIt()
tokens = md.parse(markdown_text) tokens = md.parse(markdown_text)
print('### Calling update_document_content')
self.update_document_content(tokens, document_id) self.update_document_content(tokens, document_id)
def read_markdown_file(self, file_path: str) -> str: def read_markdown_file(self, file_path: str) -> str:
@ -154,39 +153,42 @@ class MarkdownProcessor:
def update_document_content(self, tokens: List, document_id: int) -> None: def update_document_content(self, tokens: List, document_id: int) -> None:
existing_structure = self.get_existing_document_structure(document_id) existing_structure = self.get_existing_document_structure(document_id)
new_structure = self.parse_new_structure(tokens) new_structure = self.parse_new_structure(tokens, document_id)
print('### Calling merg_structures...')
self.merge_structures(existing_structure, new_structure, document_id) self.merge_structures(existing_structure, new_structure, document_id)
def get_existing_document_structure(self, document_id: int) -> Dict: def get_existing_document_structure(self, document_id: int) -> Dict:
structure = {} structure = {}
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT h.id, h.level, h.title, h.parent_id, b.content SELECT h.uuid, h.level, h.title, h.parent_uuid, h.path, b.content, b.uuid
FROM headings h FROM headings h
LEFT JOIN body b ON h.id = b.heading_id LEFT JOIN body b ON h.uuid = b.heading_uuid
WHERE h.document_id = ? AND h.isDeleted = 0 WHERE h.document_id = ? AND h.isDeleted = 0
ORDER BY h.level, h.id ORDER BY h.level, h.id
''', (document_id,)) ''', (document_id,))
for heading_id, level, title, parent_id, content in self.db_manager.cursor.fetchall(): for heading_uuid, level, title, parent_uuid, path, content, body_uuid in self.db_manager.cursor.fetchall():
structure[heading_id] = { structure[heading_uuid] = {
'id': heading_id, # Add this line to include the id in each node 'uuid': heading_uuid,
'level': level, 'level': level,
'title': title, 'title': title,
'parent_id': parent_id, 'parent_uuid': parent_uuid,
'path': path,
'content': content, 'content': content,
'body_uuid': body_uuid,
'children': [] 'children': []
} }
# Build the tree structure # Build the tree structure
for id, node in structure.items(): for uuid, node in structure.items():
if node['parent_id'] in structure: if node['parent_uuid'] in structure:
structure[node['parent_id']]['children'].append(id) structure[node['parent_uuid']]['children'].append(uuid)
return structure return structure
def parse_new_structure(self, tokens: List) -> Dict: def parse_new_structure(self, tokens: List, document_id: int) -> Dict:
structure = {} structure = {}
current_heading = None current_heading = None
current_content = [] current_content = []
parent_stack = [{"id": None, "level": 0}] parent_stack = [{"uuid": None, "level": 0, "path": ""}]
for token in tokens: for token in tokens:
if token.type == 'heading_open': if token.type == 'heading_open':
@ -195,15 +197,18 @@ class MarkdownProcessor:
level = int(token.tag.strip('h')) level = int(token.tag.strip('h'))
while parent_stack[-1]['level'] >= level: while parent_stack[-1]['level'] >= level:
parent_stack.pop() parent_stack.pop()
current_heading = str(uuid.uuid4()) # Generate a temporary ID current_heading = str(uuid.uuid4())
parent_path = parent_stack[-1]['path']
structure[current_heading] = { structure[current_heading] = {
'uuid': current_heading,
'level': level, 'level': level,
'title': '', 'title': '',
'parent_id': parent_stack[-1]['id'], 'parent_uuid': parent_stack[-1]['uuid'],
'path': f"{parent_path}/{current_heading}" if parent_path else current_heading,
'content': '', 'content': '',
'children': [] 'children': []
} }
parent_stack.append({"id": current_heading, "level": level}) parent_stack.append({"uuid": current_heading, "level": level, "path": structure[current_heading]['path']})
current_content = [] current_content = []
elif token.type == 'heading_close': elif token.type == 'heading_close':
structure[current_heading]['content'] = ''.join(current_content).strip() structure[current_heading]['content'] = ''.join(current_content).strip()
@ -221,17 +226,17 @@ class MarkdownProcessor:
return structure return structure
def merge_structures(self, existing: Dict, new: Dict, document_id: int) -> None: def merge_structures(self, existing: Dict, new: Dict, document_id: int) -> None:
def merge_recursive(existing_node, new_node, parent_id): def merge_recursive(existing_node, new_node, parent_uuid):
if not existing_node: if not existing_node:
# This is a new node, insert it # This is a new node, insert it
heading_id = self.insert_heading(new_node['level'], new_node['title'], parent_id, document_id) heading_uuid = self.insert_heading(new_node['level'], new_node['title'], parent_uuid, document_id, new_node['path'])
self.insert_body(new_node['content'], heading_id, document_id) body_uuid = self.insert_body(new_node['content'], heading_uuid, document_id)
for child in new_node['children']: for child in new_node['children']:
merge_recursive(None, new[child], heading_id) merge_recursive(None, new[child], heading_uuid)
else: else:
# Update existing node # Update existing node
self.update_heading(existing_node['id'], new_node['title'], new_node['level'], parent_id) self.update_heading(existing_node['uuid'], new_node['title'], new_node['level'], parent_uuid, new_node['path'])
self.update_body(existing_node['id'], new_node['content'], document_id) self.update_body(existing_node['body_uuid'], new_node['content'], document_id)
# Process children # Process children
existing_children = {child['title']: child for child in existing_node['children']} existing_children = {child['title']: child for child in existing_node['children']}
@ -239,89 +244,92 @@ class MarkdownProcessor:
for title, child in new_children.items(): for title, child in new_children.items():
if title in existing_children: if title in existing_children:
merge_recursive(existing_children[title], child, existing_node['id']) merge_recursive(existing_children[title], child, existing_node['uuid'])
else: else:
merge_recursive(None, child, existing_node['id']) merge_recursive(None, child, existing_node['uuid'])
for title, child in existing_children.items(): for title, child in existing_children.items():
if title not in new_children: if title not in new_children:
self.soft_delete_heading(child['id']) self.soft_delete_heading(child['uuid'])
for new_root in new.values(): for new_root in new.values():
existing_root = next((node for node in existing.values() if node['title'] == new_root['title']), None) existing_root = next((node for node in existing.values() if node['path'] == new_root['path']), None)
merge_recursive(existing_root, new_root, None) merge_recursive(existing_root, new_root, None)
def insert_heading(self, level: int, title: str, parent_id: Optional[int], document_id: int) -> int: def insert_heading(self, level: int, title: str, parent_uuid: Optional[str], document_id: int, path: str) -> str:
heading_uuid = str(uuid.uuid4())
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
INSERT INTO headings (level, title, parent_id, document_id) INSERT INTO headings (uuid, level, title, parent_uuid, document_id, path)
VALUES (?, ?, ?, ?) VALUES (?, ?, ?, ?, ?, ?)
''', (level, title, parent_id, document_id)) ''', (heading_uuid, level, title, parent_uuid, document_id, path))
return self.db_manager.cursor.lastrowid return heading_uuid
def update_heading(self, heading_id: int, title: str, level: int, parent_id: Optional[int]) -> None: def update_heading(self, heading_uuid: str, title: str, level: int, parent_uuid: Optional[str], path: str) -> None:
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE headings UPDATE headings
SET title = ?, level = ?, parent_id = ?, updated_timestamp = CURRENT_TIMESTAMP SET title = ?, level = ?, parent_uuid = ?, path = ?, updated_timestamp = CURRENT_TIMESTAMP
WHERE id = ? WHERE uuid = ?
''', (title, level, parent_id, heading_id)) ''', (title, level, parent_uuid, path, heading_uuid))
def insert_body(self, content: str, heading_id: int, document_id: int) -> None: def insert_body(self, content: str, heading_uuid: str, document_id: int) -> str:
body_uuid = str(uuid.uuid4())
md5sum = hashlib.md5(content.encode()).hexdigest() md5sum = hashlib.md5(content.encode()).hexdigest()
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
INSERT INTO body (content, heading_id, document_id, md5sum) INSERT INTO body (uuid, content, heading_uuid, document_id, md5sum)
VALUES (?, ?, ?, ?) VALUES (?, ?, ?, ?, ?)
''', (content, heading_id, document_id, md5sum)) ''', (body_uuid, content, heading_uuid, document_id, md5sum))
return body_uuid
def update_body(self, heading_id: int, content: str, document_id: int) -> None: def update_body(self, body_uuid: str, content: str, document_id: int) -> None:
md5sum = hashlib.md5(content.encode()).hexdigest() md5sum = hashlib.md5(content.encode()).hexdigest()
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE body UPDATE body
SET content = ?, md5sum = ?, updated_timestamp = CURRENT_TIMESTAMP SET content = ?, md5sum = ?, updated_timestamp = CURRENT_TIMESTAMP
WHERE heading_id = ? AND document_id = ? WHERE uuid = ? AND document_id = ?
''', (content, md5sum, heading_id, document_id)) ''', (content, md5sum, body_uuid, document_id))
def soft_delete_heading(self, heading_id: int) -> None: def soft_delete_heading(self, heading_uuid: str) -> None:
now = datetime.now().isoformat() now = datetime.now().isoformat()
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE headings UPDATE headings
SET isDeleted = 1, deleted_timestamp = ? SET isDeleted = 1, deleted_timestamp = ?
WHERE id = ? WHERE uuid = ?
''', (now, heading_id)) ''', (now, heading_uuid))
# Also soft delete associated body content # Also soft delete associated body content
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
UPDATE body UPDATE body
SET isDeleted = 1, deleted_timestamp = ? SET isDeleted = 1, deleted_timestamp = ?
WHERE heading_id = ? WHERE heading_uuid = ?
''', (now, heading_id)) ''', (now, heading_uuid))
class TopicReader: class TopicReader:
"""Reads and retrieves topics from the database.""" """Reads and retrieves topics from the database."""
def __init__(self, db_manager: 'DatabaseManager'): def __init__(self, db_manager: 'DatabaseManager'):
self.db_manager = db_manager self.db_manager = db_manager
def fetch_headings(self) -> List[Tuple[int, str, int, Optional[int]]]: def fetch_headings(self) -> List[Tuple[str, str, int, Optional[str]]]:
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT id, title, level, parent_id SELECT uuid, title, level, parent_uuid
FROM headings FROM headings
WHERE isDeleted = 0 WHERE isDeleted = 0
ORDER BY level, id ORDER BY level, headings_order
''') ''')
return self.db_manager.cursor.fetchall() return self.db_manager.cursor.fetchall()
def fetch_topic_chain(self, heading_id: int) -> List[Tuple[int, str, int]]: def fetch_topic_chain(self, heading_uuid: str) -> List[Tuple[str, str, int]]:
chain = [] chain = []
current_id = heading_id current_uuid = heading_uuid
while current_id is not None: while current_uuid is not None:
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT id, title, level, parent_id SELECT uuid, title, level, parent_uuid
FROM headings FROM headings
WHERE id = ? WHERE uuid = ?
''', (current_id,)) ''', (current_uuid,))
result = self.db_manager.cursor.fetchone() result = self.db_manager.cursor.fetchone()
if result: if result:
chain.append((result[0], result[1], result[2])) chain.append((result[0], result[1], result[2]))
current_id = result[3] current_uuid = result[3]
else: else:
break break
@ -331,33 +339,33 @@ class TopicReader:
headings = self.fetch_headings() headings = self.fetch_headings()
result = "Available headings:\n" result = "Available headings:\n"
def build_tree(parent_id, level): def build_tree(parent_uuid, level):
tree = "" tree = ""
for id, title, hlevel, parent in headings: for uuid, title, hlevel, parent in headings:
if parent == parent_id: if parent == parent_uuid:
indent = " " * (hlevel - 1) indent = " " * (hlevel - 1)
tree += f"{indent}- {title}\n" tree += f"{indent}- {title}\n"
tree += build_tree(id, hlevel + 1) tree += build_tree(uuid, hlevel + 1)
return tree return tree
result += build_tree(None, 1) result += build_tree(None, 1)
return result.strip() return result.strip()
def get_topic_content(self, input_title: str) -> Optional[str]: def get_topic_content(self, input_title: str) -> Optional[str]:
heading_id = self.find_closest_heading(input_title) heading_uuid = self.find_closest_heading(input_title)
if heading_id: if heading_uuid:
topic_chain = self.fetch_topic_chain(heading_id) topic_chain = self.fetch_topic_chain(heading_uuid)
result = self.build_full_content(topic_chain[-1][0]) result = self.build_full_content(topic_chain[-1][0])
return result return result
return None return None
def build_full_content(self, heading_id: int, level_offset: int = 0) -> str: def build_full_content(self, heading_uuid: str, level_offset: int = 0) -> str:
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT h.level, h.title, b.content SELECT h.level, h.title, b.content
FROM headings h FROM headings h
LEFT JOIN body b ON h.id = b.heading_id LEFT JOIN body b ON h.uuid = b.heading_uuid
WHERE h.id = ? AND h.isDeleted = 0 WHERE h.uuid = ? AND h.isDeleted = 0
''', (heading_id,)) ''', (heading_uuid,))
heading = self.db_manager.cursor.fetchone() heading = self.db_manager.cursor.fetchone()
if not heading: if not heading:
return "" return ""
@ -370,17 +378,17 @@ class TopicReader:
# Fetch and process all child headings # Fetch and process all child headings
self.db_manager.cursor.execute(''' self.db_manager.cursor.execute('''
SELECT id FROM headings SELECT uuid FROM headings
WHERE parent_id = ? AND isDeleted = 0 WHERE parent_uuid = ? AND isDeleted = 0
ORDER BY level, id ORDER BY level, headings_order
''', (heading_id,)) ''', (heading_uuid,))
children = self.db_manager.cursor.fetchall() children = self.db_manager.cursor.fetchall()
for child in children: for child in children:
result += self.build_full_content(child[0], level_offset) result += self.build_full_content(child[0], level_offset)
return result return result
def find_closest_heading(self, input_title: str) -> Optional[int]: def find_closest_heading(self, input_title: str) -> Optional[str]:
headings = self.fetch_headings() headings = self.fetch_headings()
if not headings: if not headings:
print("No topics found in the database.") print("No topics found in the database.")
@ -393,9 +401,9 @@ class TopicReader:
print(f"No close matches found for '{input_title}' (Confidence: {confidence})") print(f"No close matches found for '{input_title}' (Confidence: {confidence})")
return None return None
for heading_id, title, _, _ in headings: for heading_uuid, title, _, _ in headings:
if title == closest_match: if title == closest_match:
return heading_id return heading_uuid
return None return None
@ -408,41 +416,57 @@ def compute_file_hash(file_path: str) -> str:
for chunk in iter(lambda: f.read(4096), b""): for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk) hash_md5.update(chunk)
return hash_md5.hexdigest() return hash_md5.hexdigest()
def generate_calendar(year: int) -> str: def generate_calendar(year: int, db_manager: 'DatabaseManager', document_id: int) -> str:
"""Generate a markdown calendar for the specified year.""" """
Generate a markdown calendar for the specified year.
"""
calendar_markdown = f"# {year}\n\n" calendar_markdown = f"# {year}\n\n"
current_date = datetime.now().date() current_date = datetime.now().date()
# Loop through the months
for month in range(1, 13): for month in range(1, 13):
month_name = datetime(year, month, 1).strftime('%B') month_name = datetime(year, month, 1).strftime('%B')
calendar_markdown += f"## {month:02d} / {month_name}\n\n" calendar_markdown += f"## {month:02d} / {month_name}\n\n"
# Calculate the number of days in the month # Determine the number of days in the month
num_days = (datetime(year, month + 1, 1) - datetime(year, month, 1)).days if month < 12 else (datetime(year + 1, 1, 1) - datetime(year, month, 1)).days if month == 12:
num_days = (datetime(year + 1, 1, 1) - datetime(year, month, 1)).days
# Generate calendar entries for each day else:
num_days = (datetime(year, month + 1, 1) - datetime(year, month, 1)).days
# Create calendar entries for each day in order
for day in range(1, num_days + 1): for day in range(1, num_days + 1):
day_date = datetime(year, month, day).date() day_date = datetime(year, month, day).date()
day_name = day_date.strftime('%a') day_name = day_date.strftime('%a')
# Check if this is the current day and make it bold if so # Add bold styling for the current date
if str(day_date) == str(current_date): if str(day_date) == str(current_date):
calendar_markdown += f"**{day:02d} / {day_name}**\n" calendar_markdown += f"**{day:02d} / {day_name}**\n"
else: else:
calendar_markdown += f"{day:02d} / {day_name}\n" calendar_markdown += f"{day:02d} / {day_name}\n"
calendar_markdown += "\n" # Add a newline after each month calendar_markdown += '\n'
# Now parse the markdown and insert into the database
#parse_and_insert_markdown(calendar_markdown, db_manager, document_id, year)
return calendar_markdown return calendar_markdown
def convert_to_html(markdown_content: str) -> str: def convert_to_html(markdown_content: str, heading_uuid: Optional[str] = None) -> str:
""" """
Convert Markdown content to HTML. Convert Markdown content (or specific section) to HTML.
""" """
md = MarkdownIt() md = MarkdownIt()
html_content = md.render(markdown_content)
if heading_uuid:
# Fetch content for a specific heading and its sub-headings from the database
# Example SQL to get heading content based on UUID:
# SELECT title, content FROM headings WHERE uuid = ?
pass
html_content = md.render(markdown_content)
# Wrap the content in a basic HTML structure # Wrap the content in a basic HTML structure
html_document = f""" html_document = f"""
<!DOCTYPE html> <!DOCTYPE html>
@ -450,7 +474,7 @@ def convert_to_html(markdown_content: str) -> str:
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Calendar</title> <title>Document</title>
<style> <style>
body {{ font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }} body {{ font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }}
h1, h2, h3, h4, h5, h6 {{ margin-top: 24px; margin-bottom: 16px; }} h1, h2, h3, h4, h5, h6 {{ margin-top: 24px; margin-bottom: 16px; }}
@ -464,160 +488,99 @@ def convert_to_html(markdown_content: str) -> str:
</body> </body>
</html> </html>
""" """
return html_document return html_document
def bootstrap_calendar(year: int, db_manager: DatabaseManager, markdown_file: str): def bootstrap_calendar(year: int, db_manager: DatabaseManager, markdown_file: str):
"""Generate and store a full year's markdown calendar in the database.""" """Generate and store a full year's markdown calendar in the database using UUIDs."""
calendar_content = generate_calendar(year) document_manager = DocumentManager(db_manager)
markdown_processor = MarkdownProcessor(db_manager)
# Write the calendar to the specified markdown file
# Generate calendar markdown and insert into the database
print('## Generating calendar')
calendar_content = generate_calendar(year, db_manager, document_id=document_manager.create_document(f"{year} Calendar", markdown_file))
# Write the calendar to the markdown file
print('## Reading the newly created calendar')
with open(markdown_file, 'w', encoding='utf-8') as f: with open(markdown_file, 'w', encoding='utf-8') as f:
f.write(calendar_content) f.write(calendar_content)
# Now use the DocumentManager and MarkdownProcessor to read this file into the database # Process the markdown to update or store in the database
document_manager = DocumentManager(db_manager) markdown_processor.process_markdown(markdown_file, document_manager.document_exists(f"{year} Calendar")[0])
document_id = document_manager.create_document(os.path.basename(markdown_file), markdown_file)
markdown_processor = MarkdownProcessor(db_manager)
markdown_processor.process_markdown(markdown_file, document_id)
print(f"Calendar for year {year} has been generated and stored in the database.") print(f"Calendar for year {year} has been generated and stored in the database.")
def main(): def main():
""" """
This script processes a markdown file, updates an SQLite database, This script processes a markdown file, updates an SQLite database,
and optionally selects a topic based on user input. and optionally selects a topic based on user input.
Initializes managers for database and markdown handling. Updates documents based on
MD5 hash changes, and if a topic is provided, retrieves and writes its content
to the markdown file.
Args:
-m, --markdown: Path to markdown file (default: 'calendar.md').
-d, --database: Path to SQLite database file (default: 'markdown.db').
topic_title: Optional topic for content selection (fuzzy matching enabled).
--bootstrap: If provided, generates markdown calendar for the current year and loads it to the database.
--ls: If provided, lists all available headings.
--html: If provided, will produce {filename}.html file along the markdown file.
""" """
# Set up command-line argument parsing
parser = argparse.ArgumentParser(description='Process markdown file and optionally select a topic.') parser = argparse.ArgumentParser(description='Process markdown file and optionally select a topic.')
parser.add_argument('-m', '--markdown', type=str, default='calendar.md', help='Input/output markdown file (default: calendar.md)') parser.add_argument('-m', '--markdown', type=str, default='calendar.md', help='Input/output markdown file (default: calendar.md)')
parser.add_argument('-d', '--database', type=str, default='markdown.db', help='SQLite database file (default: markdown.db)') parser.add_argument('-d', '--database', type=str, default='markdown.db', help='SQLite database file (default: markdown.db)')
parser.add_argument('topic_title', nargs='?', type=str, help='Topic title to select (fuzzy matching enabled)') parser.add_argument('topic_title', nargs='?', type=str, help='Topic title to select (fuzzy matching enabled)')
parser.add_argument('--bootstrap', action='store_true', help='Generate markdown calendar for the current year and load it to the database.') parser.add_argument('--bootstrap', action='store_true', help='Generate markdown calendar for the current year and load it to the database.')
parser.add_argument('--ls', action='store_true', help='List all available headings.') parser.add_argument('--ls', action='store_true', help='List all available headings.')
parser.add_argument('--html', action='store_true', help='Generate an HTML version of the output') parser.add_argument('--html', action='store_true', help='Generate an HTML version of the output.')
parser.add_argument('--uuid', type=str, help='Specify a UUID to retrieve content.')
parser.add_argument('--debug', action='store_true', help='Enable debug printing') parser.add_argument('--debug', action='store_true', help='Enable debug printing')
args = parser.parse_args() args = parser.parse_args()
# Set up logging # Setup basic logging
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
# Check for markdown file presence
# Use the provided or default file paths if not os.path.exists(args.markdown) and not args.bootstrap:
markdown_file = args.markdown print(f"Error: Markdown file '{args.markdown}' not found. Use --bootstrap to create a new calendar.")
database_file = args.database db_manager.close()
return
# Initialize manager objects for database operations # Check for databse file presence
db_manager = DatabaseManager(database_file) if not os.path.exists(args.database) and not args.bootstrap:
print(f"Error: Database file '{args.database}' not found. Use --bootstrap to create a new calendar.")
if args.bootstrap:
bootstrap_calendar(datetime.now().year, db_manager, markdown_file)
db_manager.close() db_manager.close()
return return
# Initialize manager objects
db_manager = DatabaseManager(args.database)
if args.bootstrap:
print('## Running calendar bootstrap')
bootstrap_calendar(datetime.now().year, db_manager, args.markdown)
db_manager.close()
return
document_manager = DocumentManager(db_manager)
if args.ls: if args.ls:
topic_reader = TopicReader(db_manager) topic_reader = TopicReader(db_manager)
print(topic_reader.list_headings()) print(topic_reader.list_headings())
db_manager.close() db_manager.close()
return return
# Check if the markdown file exists # Topic or UUID-based content retrieval
if not os.path.exists(markdown_file): if args.topic_title or args.uuid:
print(f"Error: Markdown file '{markdown_file}' not found. Use --bootstrap to create a new calendar.")
db_manager.close()
return
document_manager = DocumentManager(db_manager)
markdown_processor = MarkdownProcessor(db_manager)
# Get the base name of the markdown file (without path)
document_name = os.path.basename(markdown_file)
# Check if the document already exists in the database
document = db_manager.cursor.execute('SELECT id, file_path, md5sum, updated_timestamp FROM documents WHERE name = ?', (document_name,)).fetchone()
# Compute the current MD5 hash of the markdown file
current_file_hash = compute_file_hash(markdown_file)
if document:
# If the document exists in the database
document_id, stored_file_path, stored_md5sum, last_updated = document
if stored_file_path != markdown_file:
print(f"Updating file path for '{document_name}' in the database...")
document_manager.update_document(document_id, file_path=markdown_file)
if stored_md5sum != current_file_hash:
# If the file has changed since last update
print(f"File '{document_name}' has changed. Updating the database...")
document_manager.update_document_hash(document_id, current_file_hash)
markdown_processor.process_markdown(markdown_file, document_id)
else:
# If the file hasn't changed
print(f"File '{document_name}' has not changed. Skipping update.")
else:
# If the document doesn't exist in the database
print(f"Document '{document_name}' not found in the database. Adding new entry...")
document_id = document_manager.create_document(document_name, markdown_file)
document_manager.update_document_hash(document_id, current_file_hash)
markdown_processor.process_markdown(markdown_file, document_id)
# Check if a topic title argument is provided
if args.topic_title:
# Initialize TopicReader
topic_reader = TopicReader(db_manager) topic_reader = TopicReader(db_manager)
# Retrieve the content for the specified topic if args.uuid:
result = topic_reader.get_topic_content(args.topic_title) content = topic_reader.build_full_content(args.uuid)
if result:
# If content is found, write it back to the original markdown file
with open(markdown_file, 'w', encoding='utf-8') as file:
file.write(result)
file.write('\n')
print(f"Selected topic and subtopics written to {markdown_file}")
# Generate HTML if --html option is specified
if args.html:
html_file = os.path.splitext(markdown_file)[0] + '.html'
html_content = convert_to_html(result)
with open(html_file, 'w', encoding='utf-8') as file:
file.write(html_content)
print(f"HTML version written to {html_file}")
# Update the document hash in the database
new_file_hash = compute_file_hash(markdown_file)
document_manager.update_document_hash(document_id, new_file_hash)
else: else:
# If no content is found content = topic_reader.get_topic_content(args.topic_title)
print("No result to write. The original file remains unchanged.")
else:
print("No topic title provided. The database has been updated/added without modifying the file.")
# Generate HTML for the entire document if --html option is specified if content:
# Write the selected content to the markdown file
with open(args.markdown, 'w', encoding='utf-8') as file:
file.write(content)
file.write('\n')
print(f"Selected content written to {args.markdown}")
# Optionally convert to HTML
if args.html: if args.html:
with open(markdown_file, 'r', encoding='utf-8') as file: html_file = f"{args.markdown}.html"
markdown_content = file.read()
html_file = os.path.splitext(markdown_file)[0] + '.html'
html_content = convert_to_html(markdown_content)
with open(html_file, 'w', encoding='utf-8') as file: with open(html_file, 'w', encoding='utf-8') as file:
file.write(html_content) file.write(convert_to_html(content))
print(f"HTML version of the entire document written to {html_file}") print(f"HTML version written to {html_file}")
# Close the database connection
db_manager.close() db_manager.close()
if __name__ == '__main__': if __name__ == '__main__':
main() main()