Files
prop-data-guard/backend/scripts/generate_ml_training_data.py
hiderfong 6d70520e79 feat: 全量功能模块开发与集成测试修复
- 新增后端模块:Alert、APIAsset、Compliance、Lineage、Masking、Risk、SchemaChange、Unstructured、Watermark
- 新增前端模块页面与API接口
- 新增Alembic迁移脚本(002-014)覆盖全量业务表
- 新增测试数据生成脚本与集成测试脚本
- 修复metadata模型JSON类型导入缺失导致启动失败的问题
- 修复前端Alert/APIAsset页面request模块路径错误
- 更新docker-compose与开发计划文档
2026-04-25 08:51:38 +08:00

60 lines
1.8 KiB
Python

"""
Generate synthetic manual-labeled data for ML model training/demo.
Run this script after metadata has been scanned so there are columns to label.
"""
import random
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from app.core.database import SessionLocal
from app.models.metadata import DataColumn
from app.models.classification import Category
from app.models.project import ClassificationResult
def main():
    """Replace all manual classification labels with synthetic ones.

    Loads up to 300 ``DataColumn`` rows and every ``Category`` whose
    ``level`` equals 2, then creates one ``ClassificationResult`` with
    ``source="manual"`` per column. The category for a column is picked by
    a ``random.Random`` seeded with the column name, and both queries are
    ordered by ``id``, so repeated runs against unchanged data yield the
    same labels.

    Existing manual results are deleted in the same transaction as the
    inserts: if anything fails, the session is rolled back and the old
    labels are kept. A short status message is printed in every
    non-error case.

    Returns:
        None.

    Raises:
        Exception: any error raised by the database layer is re-raised
            after the session has been rolled back.
    """
    db = SessionLocal()
    try:
        # Explicit ordering makes the LIMIT subset stable between runs.
        columns = db.query(DataColumn).order_by(DataColumn.id).limit(300).all()
        if not columns:
            print("No columns found in database. Please scan a data source first.")
            return

        # Ordered so that the seeded choice() below maps a given column name
        # to the same category on every run.
        categories = (
            db.query(Category)
            .filter(Category.level == 2)
            .order_by(Category.id)
            .all()
        )
        if not categories:
            print("No sub-categories found.")
            return

        # Clear old manual labels to avoid duplicates. Deliberately NOT
        # committed here: the delete and the inserts share one transaction so
        # a failure below cannot leave the table without any manual labels.
        db.query(ClassificationResult).filter(
            ClassificationResult.source == "manual"
        ).delete()

        labels = []
        for col in columns:
            # Seed with the column name so the pick is reproducible.
            cat = random.Random(col.name).choice(categories)
            labels.append(
                ClassificationResult(
                    # Synthetic labels are not attached to any project.
                    project_id=None,
                    column_id=col.id,
                    category_id=cat.id,
                    # NOTE(review): this stores the parent category's `level`
                    # in `level_id` (3 when there is no parent) -- confirm
                    # both refer to the same scale.
                    level_id=cat.parent.level if cat.parent else 3,
                    source="manual",
                    confidence=1.0,
                    status="manual",
                )
            )

        db.add_all(labels)
        db.commit()
        print(f"Generated {len(labels)} manual labels across {len(categories)} categories.")
    except Exception:
        # Undo the pending delete/inserts before propagating the error.
        db.rollback()
        raise
    finally:
        db.close()
# Script entry point: generate the labels only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()