"""Generate synthetic manual-labeled data for ML model training/demo.

Run this script after metadata has been scanned so there are columns to label.
"""
import os
import random
import sys

# Put the parent of this script's directory on sys.path so the ``app``
# package imports below resolve when the file is executed directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from app.core.database import SessionLocal  # noqa: E402
from app.models.metadata import DataColumn  # noqa: E402
from app.models.classification import Category  # noqa: E402
from app.models.project import ClassificationResult  # noqa: E402

# Upper bound on how many columns receive a synthetic label per run.
MAX_COLUMNS = 300

# ``level_id`` used when the chosen category has no parent.
FALLBACK_LEVEL_ID = 3


def main():
    """Replace all ``source == "manual"`` results with fresh synthetic labels.

    Loads up to ``MAX_COLUMNS`` columns and every category whose ``level`` is
    2, then assigns each column one category chosen by a ``random.Random``
    seeded with the column's name. Existing manual results are deleted and the
    new ones inserted in a single transaction, so a failure part-way through
    leaves the previous labels untouched.

    Prints a short status message and returns ``None`` in every case. Any
    exception raised while talking to the database is re-raised after the
    transaction has been rolled back.
    """
    db = SessionLocal()
    try:
        # LIMIT without ORDER BY returns an unspecified subset; order by the
        # primary key so the same columns are labeled on every run.
        columns = (
            db.query(DataColumn)
            .order_by(DataColumn.id)
            .limit(MAX_COLUMNS)
            .all()
        )
        if not columns:
            print("No columns found in database. Please scan a data source first.")
            return

        # rng.choice() indexes into this list, so its order must be stable
        # for the name-seeded choice below to be reproducible.
        categories = (
            db.query(Category)
            .filter(Category.level == 2)
            .order_by(Category.id)
            .all()
        )
        if not categories:
            print("No sub-categories found.")
            return

        # Clear old manual labels to avoid duplicates. Deliberately NOT
        # committed on its own: the delete and the inserts below succeed or
        # fail together.
        db.query(ClassificationResult).filter(
            ClassificationResult.source == "manual"
        ).delete()

        for col in columns:
            # Seeding with the column name makes the pick depend only on the
            # name (and the ordered category list), not on run order.
            rng = random.Random(col.name)
            cat = rng.choice(categories)

            # NOTE(review): level_id is taken from the parent category's
            # ``level`` attribute; confirm that is the intended source rather
            # than a dedicated sensitivity-level id.
            level_id = cat.parent.level if cat.parent else FALLBACK_LEVEL_ID

            # Synthetic results are not attached to any project.
            db.add(
                ClassificationResult(
                    project_id=None,
                    column_id=col.id,
                    category_id=cat.id,
                    level_id=level_id,
                    source="manual",
                    confidence=1.0,
                    status="manual",
                )
            )

        db.commit()
        print(f"Generated {len(columns)} manual labels across {len(categories)} categories.")
    except Exception:
        # Undo the pending delete/inserts so the old labels survive a failure.
        db.rollback()
        raise
    finally:
        db.close()


if __name__ == "__main__":
    main()