feat: 全量功能模块开发与集成测试修复

- 新增后端模块：Alert、APIAsset、Compliance、Lineage、Masking、Risk、SchemaChange、Unstructured、Watermark
- 新增前端模块页面与API接口
- 新增Alembic迁移脚本(002-014)覆盖全量业务表
- 新增测试数据生成脚本与集成测试脚本
- 修复metadata模型JSON类型导入缺失导致启动失败的问题
- 修复前端Alert/APIAsset页面request模块路径错误
- 更新docker-compose与开发计划文档
2026-04-25 08:51:38 +08:00
parent 8b2bc84399
commit 6d70520e79
110 changed files with 6125 additions and 87 deletions
@@ -51,11 +51,39 @@ def match_rule(rule: RecognitionRule, column: DataColumn) -> Tuple[bool, float]:
            if t.strip().lower() in enums:
                return True, 0.90

+    elif rule.rule_type == "similarity":
+        benchmarks = [b.strip().lower() for b in rule.rule_content.split(",") if b.strip()]
+        if not benchmarks:
+            return False, 0.0
+        from sklearn.feature_extraction.text import TfidfVectorizer
+        from sklearn.metrics.pairwise import cosine_similarity
+        texts = [t.lower() for t in targets] + benchmarks
+        try:
+            vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 3))
+            tfidf = vectorizer.fit_transform(texts)
+            target_vecs = tfidf[:len(targets)]
+            bench_vecs = tfidf[len(targets):]
+            sim_matrix = cosine_similarity(target_vecs, bench_vecs)
+            max_sim = float(sim_matrix.max())
+            if max_sim >= 0.75:
+                return True, round(min(max_sim, 0.99), 4)
+        except Exception:
+            pass
+
    return False, 0.0


-def run_auto_classification(db: Session, project_id: int, source_ids: Optional[List[int]] = None) -> dict:
-    """Run automatic classification for a project."""
+def run_auto_classification(
+    db: Session,
+    project_id: int,
+    source_ids: Optional[List[int]] = None,
+    progress_callback=None,
+) -> dict:
+    """Run automatic classification for a project.
+
+    Args:
+        progress_callback: Optional callable(scanned, matched, total) to report progress.
+    """
    project = db.query(ClassificationProject).filter(ClassificationProject.id == project_id).first()
    if not project:
        return {"success": False, "message": "项目不存在"}
@@ -82,7 +110,10 @@ def run_auto_classification(db: Session, project_id: int, source_ids: Optional[L
    columns = columns_query.all()

    matched_count = 0
-    for col in columns:
+    total = len(columns)
+    report_interval = max(1, total // 20)  # report ~20 times
+
+    for idx, col in enumerate(columns):
        # Check if already has a result for this project
        existing = db.query(ClassificationResult).filter(
            ClassificationResult.project_id == project_id,
@@ -121,12 +152,20 @@ def run_auto_classification(db: Session, project_id: int, source_ids: Optional[L
            # Increment hit count
            best_rule.hit_count = (best_rule.hit_count or 0) + 1

+        # Report progress periodically
+        if progress_callback and (idx + 1) % report_interval == 0:
+            progress_callback(scanned=idx + 1, matched=matched_count, total=total)
+
    db.commit()

+    # Final progress report
+    if progress_callback:
+        progress_callback(scanned=total, matched=matched_count, total=total)
+
    return {
        "success": True,
-        "message": f"自动分类完成，共扫描 {len(columns)} 个字段，命中 {matched_count} 个",
-        "scanned": len(columns),
+        "message": f"自动分类完成，共扫描 {total} 个字段，命中 {matched_count} 个",
+        "scanned": total,
        "matched": matched_count,
    }