6d70520e79
- 新增后端模块:Alert、APIAsset、Compliance、Lineage、Masking、Risk、SchemaChange、Unstructured、Watermark - 新增前端模块页面与API接口 - 新增Alembic迁移脚本(002-014)覆盖全量业务表 - 新增测试数据生成脚本与集成测试脚本 - 修复metadata模型JSON类型导入缺失导致启动失败的问题 - 修复前端Alert/APIAsset页面request模块路径错误 - 更新docker-compose与开发计划文档
60 lines
1.8 KiB
Python
60 lines
1.8 KiB
Python
"""
Generate synthetic manual-labeled data for ML model training/demo.

Run this script after metadata has been scanned so there are columns to label.
"""
|
|
import random
|
|
import sys
|
|
import os
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
|
|
from app.core.database import SessionLocal
|
|
from app.models.metadata import DataColumn
|
|
from app.models.classification import Category
|
|
from app.models.project import ClassificationResult
|
|
|
|
|
|
def main(limit: int = 300) -> None:
    """Regenerate synthetic "manual" classification labels for scanned columns.

    Loads up to ``limit`` ``DataColumn`` rows and every ``Category`` whose
    ``level`` equals 2, removes all ``ClassificationResult`` rows whose
    ``source`` is ``"manual"``, and inserts one new manual result per loaded
    column. The category for a column is chosen by a ``random.Random`` seeded
    with the column name, so the choice depends only on the name and on the
    (id-ordered) category list.

    The delete and the inserts are committed as a single transaction; if
    anything fails the session is rolled back so the previous manual labels
    are not lost.

    Args:
        limit: Maximum number of columns to label. Defaults to 300.
    """
    db = SessionLocal()
    try:
        # Explicit ordering keeps the selected columns stable between runs;
        # without ORDER BY the rows returned under LIMIT are unspecified.
        columns = db.query(DataColumn).order_by(DataColumn.id).limit(limit).all()
        if not columns:
            print("No columns found in database. Please scan a data source first.")
            return

        # rng.choice() indexes into this list, so its order must be stable
        # for the per-name seeding below to be reproducible.
        categories = (
            db.query(Category)
            .filter(Category.level == 2)
            .order_by(Category.id)
            .all()
        )
        if not categories:
            print("No sub-categories found.")
            return

        # Clear old manual labels to avoid duplicates. Deliberately NOT
        # committed here: the delete shares a transaction with the inserts.
        db.query(ClassificationResult).filter(
            ClassificationResult.source == "manual"
        ).delete()

        for col in columns:
            # Deterministic pseudo-random choice keyed on the column name.
            rng = random.Random(col.name)
            cat = rng.choice(categories)
            db.add(
                ClassificationResult(
                    # Synthetic labels are not attached to any project.
                    project_id=None,
                    column_id=col.id,
                    category_id=cat.id,
                    # NOTE(review): this stores the parent category's `level`
                    # attribute as level_id (3 when there is no parent) --
                    # confirm that is the intended data-level reference.
                    level_id=cat.parent.level if cat.parent else 3,
                    source="manual",
                    confidence=1.0,
                    status="manual",
                )
            )

        db.commit()
        print(f"Generated {len(columns)} manual labels across {len(categories)} categories.")
    except Exception:
        # Undo the pending delete/inserts so a failed run leaves data intact.
        db.rollback()
        raise
    finally:
        db.close()
|
|
|
|
|
|
# Script entry point: generate the labels only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|