Fix the heading lineage problem. Still investigating why headings are duplicated in the output.

This commit is contained in:
kalzu rekku 2024-10-04 09:21:43 +03:00
parent 5b975c5304
commit 22ff45530d

View File

@ -159,27 +159,26 @@ class MarkdownProcessor:
def store_markdown_content(self, tokens: List, document_id: int) -> None: def store_markdown_content(self, tokens: List, document_id: int) -> None:
"""Store parsed markdown content in the database.""" """Store parsed markdown content in the database."""
parent_stack: List[Optional[int]] = [] parent_stack: List[Tuple[int, int]] = [] # (level, heading_id)
current_heading_id = None
for token in tokens: for token in tokens:
if token.type == 'heading_open': if token.type == 'heading_open':
level = int(token.tag.strip('h')) level = int(token.tag.strip('h'))
content_token = tokens[tokens.index(token) + 1] content_token = tokens[tokens.index(token) + 1]
title = content_token.content title = content_token.content
parent_id = parent_stack[-1] if parent_stack else None # Find the appropriate parent
while parent_stack and parent_stack[-1][0] >= level:
parent_stack.pop()
heading_id = self.insert_heading(level, title, parent_id, document_id) parent_id = parent_stack[-1][1] if parent_stack else None
current_heading_id = self.insert_heading(level, title, parent_id, document_id)
if not parent_stack or level > len(parent_stack): parent_stack.append((level, current_heading_id))
parent_stack.append(heading_id) elif token.type == 'inline' and current_heading_id and token.content.strip():
else: # Only insert non-empty content that's not part of a heading
while parent_stack and level <= len(parent_stack): if tokens[tokens.index(token) - 1].type != 'heading_open':
parent_stack.pop() self.insert_body(token.content, current_heading_id, document_id)
parent_stack.append(heading_id)
elif token.type == 'inline' and parent_stack:
self.insert_body(token.content, parent_stack[-1], document_id)
self.db_manager.conn.commit() self.db_manager.conn.commit()
@ -276,7 +275,8 @@ class TopicReader:
body_content = '\n'.join([row[0] for row in rows]) body_content = '\n'.join([row[0] for row in rows])
# Write the heading once and then its body content # Write the heading once and then its body content
result = f"{'#' * level} {title}\n{body_content.strip()}\n" #result = f"\n{'#' * level} {title}\n{body_content.strip()}\n"
result = f"{'#' * level} {title}\n\n{body_content.strip()}\n"
if include_subtopics: if include_subtopics:
# Fetch all subtopics (e.g., days) that are children of the current heading # Fetch all subtopics (e.g., days) that are children of the current heading
@ -286,7 +286,8 @@ class TopicReader:
subtopic_content = self.fetch_body_and_subtopics(subtopic_id, include_subtopics=True) subtopic_content = self.fetch_body_and_subtopics(subtopic_id, include_subtopics=True)
result += subtopic_content result += subtopic_content
return result.strip() # Strip extra newlines #return result.strip() # Strip extra newlines
return result
def _fetch_subtopics(self, heading_id: int, parent_level: int) -> List[Tuple[int, int, str]]: def _fetch_subtopics(self, heading_id: int, parent_level: int) -> List[Tuple[int, int, str]]:
""" """
@ -325,7 +326,7 @@ class TopicReader:
result += self.fetch_body_and_subtopics(id, include_subtopics=True) result += self.fetch_body_and_subtopics(id, include_subtopics=True)
else: else:
# Include only the heading chain without duplicating content # Include only the heading chain without duplicating content
result += f"{'#' * level} {title}\n\n" result += f"\n{'#' * level} {title}\n\n"
return result.strip() # Ensure there are no trailing newlines return result.strip() # Ensure there are no trailing newlines
print(f"No topic found matching '{input_title}'.") print(f"No topic found matching '{input_title}'.")
return None return None