Files
codetutor/backend/scripts/load_data.py

281 lines
9.5 KiB
Python

#!/usr/bin/env python
"""Load YAML content data into the database."""
import asyncio
import sys
from pathlib import Path
from typing import Any
import yaml
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.db.database import async_session_factory
from src.models import Category, Difficulty, Explanation, Pattern, Question, Solution
async def load_categories(session: AsyncSession, data_dir: Path) -> dict[str, Category]:
    """Load categories from YAML file.

    Reads ``<data_dir>/categories/categories.yaml`` and upserts one
    ``Category`` row per entry under its top-level ``categories`` key,
    matching existing rows by slug.

    Args:
        session: Open async session; rows are added and flushed, not committed.
        data_dir: Root directory of the YAML content.

    Returns:
        Mapping of category slug to the ``Category`` instance that was
        created or updated. Empty when the file is missing or lists nothing.

    Raises:
        KeyError: If an entry lacks ``slug`` or ``name``.
    """
    categories_file = data_dir / "categories" / "categories.yaml"
    if not categories_file.exists():
        print(f"Warning: {categories_file} not found")
        return {}

    # Explicit encoding: the platform default is locale-dependent.
    with open(categories_file, encoding="utf-8") as f:
        # An empty document parses to None; treat it as "no categories".
        data = yaml.safe_load(f) or {}

    categories: dict[str, Category] = {}
    # `or []` also covers a bare `categories:` key, which parses to None.
    for item in data.get("categories") or []:
        slug = item["slug"]
        result = await session.execute(select(Category).where(Category.slug == slug))
        category = result.scalar_one_or_none()
        if category is None:
            category = Category(
                name=item["name"],
                slug=slug,
                description=item.get("description"),
            )
            session.add(category)
        else:
            category.name = item["name"]
            category.description = item.get("description")
        categories[slug] = category

    await session.flush()
    print(f"Loaded {len(categories)} categories")
    return categories
async def load_patterns(session: AsyncSession, data_dir: Path) -> dict[str, Pattern]:
    """Load patterns from YAML files.

    Supports both:
    - Individual files: patterns/<slug>.yaml (preferred for tutorials)
    - Legacy single file: patterns/patterns.yaml

    Individual files are loaded first. The legacy file is then merged in:
    any of its entries whose slug was already loaded from an individual
    file is skipped, so individual files take precedence.

    Args:
        session: Open async session; rows are added and flushed, not committed.
        data_dir: Root directory of the YAML content.

    Returns:
        Mapping of pattern slug to the ``Pattern`` that was created or updated.

    Raises:
        KeyError: If a loaded entry lacks ``name``, or a legacy entry lacks
            ``slug``.
    """
    patterns_dir = data_dir / "patterns"
    legacy_file = patterns_dir / "patterns.yaml"
    patterns: dict[str, Pattern] = {}

    # Sorted so the load order (and the log output) does not depend on
    # filesystem enumeration order.
    individual_files = sorted(
        path for path in patterns_dir.glob("*.yaml") if path.name != legacy_file.name
    )
    for pattern_file in individual_files:
        with open(pattern_file, encoding="utf-8") as f:
            item = yaml.safe_load(f)
        # Empty files parse to None and malformed ones may parse to a list or
        # scalar; only a mapping with a slug can be upserted.
        if not isinstance(item, dict) or "slug" not in item:
            print(f" Warning: Skipping {pattern_file.name} - missing slug")
            continue
        patterns[item["slug"]] = await _upsert_pattern(session, item)
        print(f" Loaded: {item['name']}")

    # Merge the legacy patterns.yaml, without overriding individual files.
    if legacy_file.exists():
        with open(legacy_file, encoding="utf-8") as f:
            # An empty document parses to None; treat it as "no patterns".
            data = yaml.safe_load(f) or {}
        for item in data.get("patterns") or []:
            if item["slug"] in patterns:
                continue
            patterns[item["slug"]] = await _upsert_pattern(session, item)

    await session.flush()
    print(f"Loaded {len(patterns)} patterns")
    return patterns
async def _upsert_pattern(session: AsyncSession, item: dict[str, Any]) -> Pattern:
    """Create or refresh the ``Pattern`` row described by one YAML mapping.

    The row is looked up by ``item["slug"]``; when none exists a new
    ``Pattern`` is added to the session. ``name`` is required, every other
    field is copied with ``item.get`` and therefore reset to ``None`` when
    the key is absent. Nothing is flushed here.
    """
    # Copied verbatim from the YAML mapping, in this order.
    optional_fields = (
        # core
        "description",
        "when_to_use",
        # tutorial content
        "metaphor",
        "core_concept",
        "visualization",
        "code_template",
        # structured data
        "recognition_signals",
        "common_mistakes",
        "variations",
        "related_patterns",
        "prerequisite_patterns",
        # difficulty
        "difficulty_level",
    )

    lookup = select(Pattern).where(Pattern.slug == item["slug"])
    pattern = (await session.execute(lookup)).scalar_one_or_none()
    if not pattern:
        pattern = Pattern(slug=item["slug"])
        session.add(pattern)

    pattern.name = item["name"]
    for field_name in optional_fields:
        setattr(pattern, field_name, item.get(field_name))
    return pattern
def _question_fields(data: dict[str, Any]) -> dict[str, Any]:
    """Map a question YAML document onto ``Question`` attribute values (slug excluded)."""
    return {
        "title": data["title"],
        "difficulty": Difficulty(data["difficulty"].lower()),
        "description": data["description"],
        "constraints": data.get("constraints"),
        "examples": data.get("examples"),
        "leetcode_id": data.get("leetcode_id"),
        "leetcode_url": data.get("leetcode_url"),
        "function_signature": data.get("function_signature"),
        "test_cases": data.get("test_cases"),
    }


def _explanation_fields(exp_data: dict[str, Any]) -> dict[str, Any]:
    """Map the ``explanation`` YAML mapping onto ``Explanation`` attribute values."""
    return {
        "approach": exp_data["approach"],
        "intuition": exp_data["intuition"],
        "common_pitfalls": exp_data.get("common_pitfalls"),
        "key_takeaways": exp_data.get("key_takeaways"),
        "time_complexity": exp_data["time_complexity"],
        "space_complexity": exp_data["space_complexity"],
        "complexity_explanation": exp_data.get("complexity_explanation"),
    }


def _resolve_links(
    slugs: list[str] | None,
    known: dict[str, Any],
    kind: str,
    question_slug: str,
) -> list[Any]:
    """Return ``known[slug]`` for each slug, warning about slugs that were never loaded."""
    linked = []
    for slug in slugs or []:
        if slug in known:
            linked.append(known[slug])
        else:
            # Still skipped, but no longer silently: a typo in the YAML would
            # otherwise leave the question quietly unlinked.
            print(f" Warning: {question_slug} references unknown {kind} '{slug}'")
    return linked


async def load_question(
    session: AsyncSession,
    question_file: Path,
    categories: dict[str, Category],
    patterns: dict[str, Pattern],
) -> None:
    """Load a single question from YAML file.

    Upserts the ``Question`` identified by the document's ``slug``, replaces
    its category and pattern links, upserts its explanation (when the document
    has one) and recreates its solutions from scratch.

    Args:
        session: Open async session; changes are flushed but not committed.
        question_file: Path of the question's YAML document.
        categories: Loaded categories keyed by slug.
        patterns: Loaded patterns keyed by slug.

    Raises:
        ValueError: If the file does not contain a YAML mapping, or the
            ``difficulty`` value is rejected by ``Difficulty``.
        KeyError: If a required key (``slug``, ``title``, ``difficulty``,
            ``description``, or a required explanation/solution key) is missing.
    """
    # Explicit encoding: the platform default is locale-dependent.
    with open(question_file, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        # Empty files parse to None; fail with the file name instead of an
        # opaque TypeError on the first subscript.
        raise ValueError(f"{question_file} does not contain a YAML mapping")

    slug = data["slug"]
    result = await session.execute(
        select(Question)
        .where(Question.slug == slug)
        .options(
            selectinload(Question.explanation),
            selectinload(Question.solutions),
            selectinload(Question.categories),
            selectinload(Question.patterns),
        )
    )
    existing = result.scalar_one_or_none()

    # One field table feeds both branches so create and update cannot drift.
    fields = _question_fields(data)
    if existing is not None:
        question = existing
        for attr, value in fields.items():
            setattr(question, attr, value)
    else:
        question = Question(slug=slug, **fields)
        session.add(question)

    question.categories = _resolve_links(data.get("categories"), categories, "category", slug)
    question.patterns = _resolve_links(data.get("patterns"), patterns, "pattern", slug)

    # Flush so a newly created question has its id before children reference it.
    await session.flush()

    # Handle explanation. Relationships are only read from `existing`, the
    # row fetched above with its relationships eagerly loaded.
    exp_data = data.get("explanation")
    if exp_data:
        exp_fields = _explanation_fields(exp_data)
        if existing is not None and existing.explanation:
            for attr, value in exp_fields.items():
                setattr(existing.explanation, attr, value)
        else:
            session.add(Explanation(question_id=question.id, **exp_fields))

    # Handle solutions (delete existing and recreate)
    if existing is not None and existing.solutions:
        for sol in existing.solutions:
            await session.delete(sol)
        await session.flush()

    for sol_data in data.get("solutions") or []:
        session.add(
            Solution(
                question_id=question.id,
                approach_name=sol_data["approach_name"],
                code=sol_data["code"],
                language=sol_data.get("language", "python"),
                is_optimal=sol_data.get("is_optimal", False),
                explanation=sol_data.get("explanation"),
            )
        )

    print(f" Loaded: {data['title']}")
async def load_questions(
    session: AsyncSession,
    data_dir: Path,
    categories: dict[str, Category],
    patterns: dict[str, Pattern],
) -> int:
    """Load every ``*.yaml`` file under ``<data_dir>/questions``.

    Files are processed in sorted path order, each one handed to
    ``load_question``. Returns the number of files processed, or 0 (after
    printing a warning) when the questions directory does not exist.
    """
    source_dir = data_dir / "questions"
    if source_dir.exists():
        yaml_paths = sorted(source_dir.glob("*.yaml"))
        for yaml_path in yaml_paths:
            await load_question(session, yaml_path, categories, patterns)
        return len(yaml_paths)

    print(f"Warning: {source_dir} not found")
    return 0
async def main() -> None:
    """Run the full content import: categories, patterns, then questions.

    Content is read from the ``data`` directory two levels above this
    script. All three loaders share one session, which is committed once
    after the last loader returns.
    """
    content_root = Path(__file__).parents[1] / "data"
    print("Loading content data...")
    print(f"Data directory: {content_root}")

    async with async_session_factory() as db:
        category_map = await load_categories(db, content_root)
        pattern_map = await load_patterns(db, content_root)
        loaded = await load_questions(db, content_root, category_map, pattern_map)
        await db.commit()

    print(f"\nDone! Loaded {loaded} questions.")
# Script entry point: run the async loader only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())