feat: 全量功能模块开发与集成测试修复

- 新增后端模块:Alert、APIAsset、Compliance、Lineage、Masking、Risk、SchemaChange、Unstructured、Watermark
- 新增前端模块页面与API接口
- 新增Alembic迁移脚本(002-014)覆盖全量业务表
- 新增测试数据生成脚本与集成测试脚本
- 修复 metadata 模型因缺少 JSON 类型导入而导致启动失败的问题
- 修复前端 Alert/APIAsset 页面中 request 模块导入路径错误的问题
- 更新docker-compose与开发计划文档
This commit is contained in:
hiderfong
2026-04-25 08:51:38 +08:00
parent 8b2bc84399
commit 6d70520e79
110 changed files with 6125 additions and 87 deletions
+44 -5
View File
@@ -51,11 +51,39 @@ def match_rule(rule: RecognitionRule, column: DataColumn) -> Tuple[bool, float]:
if t.strip().lower() in enums:
return True, 0.90
elif rule.rule_type == "similarity":
benchmarks = [b.strip().lower() for b in rule.rule_content.split(",") if b.strip()]
if not benchmarks:
return False, 0.0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
texts = [t.lower() for t in targets] + benchmarks
try:
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 3))
tfidf = vectorizer.fit_transform(texts)
target_vecs = tfidf[:len(targets)]
bench_vecs = tfidf[len(targets):]
sim_matrix = cosine_similarity(target_vecs, bench_vecs)
max_sim = float(sim_matrix.max())
if max_sim >= 0.75:
return True, round(min(max_sim, 0.99), 4)
except Exception:
pass
return False, 0.0
def run_auto_classification(db: Session, project_id: int, source_ids: Optional[List[int]] = None) -> dict:
"""Run automatic classification for a project."""
def run_auto_classification(
db: Session,
project_id: int,
source_ids: Optional[List[int]] = None,
progress_callback=None,
) -> dict:
"""Run automatic classification for a project.
Args:
progress_callback: Optional callable(scanned, matched, total) to report progress.
"""
project = db.query(ClassificationProject).filter(ClassificationProject.id == project_id).first()
if not project:
return {"success": False, "message": "项目不存在"}
@@ -82,7 +110,10 @@ def run_auto_classification(db: Session, project_id: int, source_ids: Optional[L
columns = columns_query.all()
matched_count = 0
for col in columns:
total = len(columns)
report_interval = max(1, total // 20) # report ~20 times
for idx, col in enumerate(columns):
# Check if already has a result for this project
existing = db.query(ClassificationResult).filter(
ClassificationResult.project_id == project_id,
@@ -121,12 +152,20 @@ def run_auto_classification(db: Session, project_id: int, source_ids: Optional[L
# Increment hit count
best_rule.hit_count = (best_rule.hit_count or 0) + 1
# Report progress periodically
if progress_callback and (idx + 1) % report_interval == 0:
progress_callback(scanned=idx + 1, matched=matched_count, total=total)
db.commit()
# Final progress report
if progress_callback:
progress_callback(scanned=total, matched=matched_count, total=total)
return {
"success": True,
"message": f"自动分类完成,共扫描 {len(columns)} 个字段,命中 {matched_count}",
"scanned": len(columns),
"message": f"自动分类完成,共扫描 {total} 个字段,命中 {matched_count}",
"scanned": total,
"matched": matched_count,
}