Convert GPB Notes to MD Files
Download the notes document from Google Docs in Markdown format, then run the Python script below on the downloaded Markdown file to clean up its formatting.
I don’t need to distinguish between the different highlight colors, so the conversion starts directly from the “All your annotations” section of the document. The “ideas” section is also omitted: it typically contains keywords rather than complete sentences, so it isn’t needed in the excerpts.
This script performs only the following three operations:
- Converts chapter titles.
- Formats excerpts.
- Sets the format for the converted filenames.
import re
import os
def format_output_filename(input_filename):
    """Build the ``Notes_<title>.md`` output name for an input file name.

    The title is the text that follows the first underscore in the file's
    base name, cut at the first comma when there is one. Trailing
    underscores and surrounding whitespace are removed from the title and
    the remaining spaces are replaced with hyphens.

    Args:
        input_filename: Name of the input file; its extension is ignored.

    Returns:
        The output file name, starting with ``Notes_`` and ending in ``.md``.
    """
    base_name = os.path.splitext(input_filename)[0]

    match = re.search(r"_(.*?)(?:,|$)", base_name)
    if match:
        # Without a comma the lazy group runs to the end of the name and
        # swallows the closing underscore (e.g. "The Book_"), so trim it.
        title = match.group(1).rstrip("_").strip()
        if title:
            return f"Notes_{title.replace(' ', '-')}.md"

    # No usable title after an underscore: fall back to the first one or two
    # non-empty underscore-separated pieces of the base name.
    parts = [part for part in base_name.split("_") if part]
    return f"Notes_{'-'.join(parts[:2])}.md"
# Chapter heading line, e.g. "## *Chapter title*".
_TITLE_PATTERN = re.compile(r"## \*(.*?)\*")
# Annotation table rows. The first variant has the image marker inside the
# emphasis asterisks, the second has it in front of them.
_ANNOTATION_PATTERN_1 = re.compile(
    r"\|\s*\*!?\[\]\[image\d*\]\s*(.*?)\*\s*.*?\[\d+\]\(http.*?\)\s*\|"
)
_ANNOTATION_PATTERN_2 = re.compile(
    r"\|\s*!?\[\]\[image\d*\]\s*\*?(.*?)\*\s*.*?\[\d+\]\(http.*?\)\s*\|"
)


def _format_section(title, quotes):
    """Render one chapter as a level-3 heading followed by its block quotes."""
    return f"### {title}\n\n" + "\n\n".join(quotes) + "\n"


def _collect_sections(lines):
    """Group the annotations found in *lines* under their chapter titles.

    Returns a list of rendered sections. A chapter with no annotations is
    omitted, and annotations that appear before the first chapter heading
    (or under an empty heading) are skipped.
    """
    sections = []
    current_title = None
    quotes = []

    for line in lines:
        title_match = _TITLE_PATTERN.match(line)
        if title_match:
            # A new heading closes the previous chapter.
            if current_title and quotes:
                sections.append(_format_section(current_title, quotes))
            current_title = title_match.group(1).strip()
            quotes = []
            continue

        annotation_match = (
            _ANNOTATION_PATTERN_1.search(line) or _ANNOTATION_PATTERN_2.search(line)
        )
        if annotation_match:
            # Undo the backslash escape on exclamation marks in the input.
            text = annotation_match.group(1).strip().replace(r"\!", "!")
            quotes.append(f"> {text}")

    # Flush the chapter that was still open when the input ended.
    if current_title and quotes:
        sections.append(_format_section(current_title, quotes))
    return sections


def process_md_file(input_path, output_folder):
    """Convert the annotations of a Markdown notes file into an excerpt file.

    Only the content below the ``# All your annotations`` line is processed.
    Chapter headings become ``###`` headings and each annotation becomes a
    block quote. The result is written to ``output_folder`` under the name
    produced by ``format_output_filename``.

    Args:
        input_path: Path of the Markdown file to read (UTF-8).
        output_folder: Folder to write the result to; it is created when it
            does not exist. An empty string writes to the current directory.

    Returns:
        The path of the written output file.

    Raises:
        ValueError: If the file has no ``# All your annotations`` line.
    """
    with open(input_path, "r", encoding="utf-8") as file:
        content = file.readlines()

    # Locate the start of the annotations section.
    start_idx = next(
        (i for i, line in enumerate(content) if line.strip() == "# All your annotations"),
        None,
    )
    if start_idx is None:
        raise ValueError("No '# All your annotations' found in the file.")

    sections = _collect_sections(content[start_idx + 1:])

    # Drop any image reference definitions that made it into the output.
    output_text = "\n".join(sections)
    output_text = re.sub(r"\[image\d*\]:.*", "", output_text)

    output_filename = format_output_filename(os.path.basename(input_path))
    output_path = os.path.join(output_folder, output_filename)

    # os.makedirs("") raises FileNotFoundError, so only create the folder
    # when one was actually given.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(output_text)

    print(f"Processed file saved at: {output_path}")
    return output_path
# Example usage: the two values below look like placeholders and presumably
# need to be replaced with real paths before running the script.
input_md_file = "input_file_path"
output_directory = "output folder"

# Only run the conversion when executed as a script, so importing this module
# does not try to open the input file.
if __name__ == "__main__":
    process_md_file(input_md_file, output_directory)