Convert GPB Notes to MD Files

Jun 11, 2024

Download the notes document from Google Docs in Markdown format, then process the downloaded Markdown file with the Python script below to clean up its formatting.

I don’t need to distinguish between the various highlight colors, so the conversion starts directly from the “All your annotations” section of the document. The “ideas” section is also omitted: it typically contains keywords rather than complete sentences, so it isn’t needed in the excerpts.

This script performs only the following three operations:

1. Converts chapter titles (`## *Title*`) into `### Title` headings.
2. Formats excerpts as blockquotes (`> excerpt`).
3. Sets the format of the converted filenames (`Notes_<title>.md`).

import re
import os

def format_output_filename(input_filename: str) -> str:
    """Build the output filename ``Notes_<title>.md`` from the exported file's name.

    The title is the text that follows the first underscore in the name
    (extension removed), cut at the first comma when there is one. A trailing
    underscore left at the end of that text is dropped, and spaces are
    replaced with hyphens.

    Args:
        input_filename: File name of the exported Markdown file.

    Returns:
        The output file name, starting with ``Notes_`` and ending in ``.md``.
    """
    base_name = os.path.splitext(input_filename)[0]

    # Lazy match: everything after the first underscore up to the first comma,
    # or up to the end of the name when there is no comma.
    match = re.search(r"_(.*?)(?:,|$)", base_name)
    if match:
        # Without a comma the capture runs to the end of the name and still
        # carries the closing underscore (e.g. "Notes from _Title_"); drop it
        # so the result is "Notes_Title.md" rather than "Notes_Title_.md".
        # NOTE(review): presumably the underscores stand in for the quotation
        # marks around the title in the exported file name -- confirm.
        title = match.group(1).rstrip("_").replace(" ", "-")
        return f"Notes_{title}.md"

    # No underscore-delimited title could be extracted (typically because the
    # name contains no underscore at all): fall back to the raw name parts.
    parts = base_name.split("_")
    if len(parts) > 1:
        return f"Notes_{parts[0]}-{parts[1]}.md"
    return f"Notes_{parts[0]}.md"

def _render_section(title, annotations):
    """Return one chapter: a ``###`` heading followed by its quoted excerpts."""
    return f"### {title}\n\n" + "\n\n".join(annotations) + "\n"


def process_md_file(input_path: str, output_folder: str) -> str:
    """Convert an exported notes Markdown file into a cleaned-up notes file.

    Everything up to and including the ``# All your annotations`` line is
    discarded. Below it, lines of the form ``## *Title*`` start a new chapter
    and table rows holding an annotation are turned into ``> excerpt`` block
    quotes. Each chapter that has at least one excerpt is written as a
    ``### Title`` section. Excerpts that appear before the first chapter
    heading, and chapters without excerpts, are not written.

    Args:
        input_path: Path of the exported Markdown file (read as UTF-8).
        output_folder: Folder for the converted file; created if missing.
            An empty string means the current working directory.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If the file has no ``# All your annotations`` line.
    """
    with open(input_path, "r", encoding="utf-8") as file:
        content = file.readlines()

    # Step 1: Find the start index of the annotations section
    start_idx = next(
        (i for i, line in enumerate(content) if line.strip() == "# All your annotations"),
        None,
    )
    if start_idx is None:
        raise ValueError("No '# All your annotations' found in the file.")

    # Step 2: Keep only content below '# All your annotations'
    content = content[start_idx + 1:]
    output_lines = []

    # Step 3: Regular expression patterns
    title_pattern = re.compile(r"## \*(.*?)\*")  # Title
    # Two row layouts are recognised: the emphasis opening before the image
    # marker (pattern 1) or after it (pattern 2).
    annotation_pattern_1 = re.compile(r"\|\s*\*!?\[\]\[image\d*\]\s*(.*?)\*\s*.*?\[\d+\]\(http.*?\)\s*\|")
    annotation_pattern_2 = re.compile(r"\|\s*!?\[\]\[image\d*\]\s*\*?(.*?)\*\s*.*?\[\d+\]\(http.*?\)\s*\|")  # Annotations

    current_title = None
    annotations = []

    # Step 4: Process the file content
    for line in content:
        title_match = title_pattern.match(line)
        if title_match:
            # Save the previous title's annotations before moving to a new one
            if current_title and annotations:
                output_lines.append(_render_section(current_title, annotations))

            # Start a new title section
            current_title = title_match.group(1).strip()
            annotations = []
            continue

        # Only non-heading lines need to be tested against the annotation patterns.
        annotation_match = annotation_pattern_1.search(line) or annotation_pattern_2.search(line)
        if annotation_match:
            # The export escapes exclamation marks as "\!"; undo that.
            annotation = annotation_match.group(1).strip().replace(r"\!", "!")
            annotations.append(f"> {annotation}")

    # Save the last collected annotations
    if current_title and annotations:
        output_lines.append(_render_section(current_title, annotations))

    # Step 5: Remove any image reference definitions that ended up in the output
    output_text = "\n".join(output_lines)
    output_text = re.sub(r"\[image\d*\]:.*", "", output_text)

    # Step 6: Format output filename
    input_filename = os.path.basename(input_path)
    output_filename = format_output_filename(input_filename)
    output_path = os.path.join(output_folder, output_filename)

    # Step 7: Save output to file
    # os.makedirs("") raises FileNotFoundError, so only create the folder when
    # one was actually given; an empty folder means the current directory.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(output_text)

    print(f"Processed file saved at: {output_path}")
    return output_path

# Example usage

# Guarded so that importing this module (e.g. to reuse process_md_file) does
# not trigger a conversion; the example only runs when executed as a script.
if __name__ == "__main__":
    # Placeholders: replace with the path of the downloaded Markdown file and
    # the folder the converted notes should be written to.
    input_md_file = "input_file_path"

    output_directory = "output folder"

    process_md_file(input_md_file, output_directory)