feat: 全量功能模块开发与集成测试修复
- 新增后端模块:Alert、APIAsset、Compliance、Lineage、Masking、Risk、SchemaChange、Unstructured、Watermark
- 新增前端模块页面与API接口
- 新增Alembic迁移脚本(002-014)覆盖全量业务表
- 新增测试数据生成脚本与集成测试脚本
- 修复metadata模型JSON类型导入缺失导致启动失败的问题
- 修复前端Alert/APIAsset页面request模块路径错误
- 更新docker-compose与开发计划文档
This commit is contained in:
@@ -51,11 +51,39 @@ def match_rule(rule: RecognitionRule, column: DataColumn) -> Tuple[bool, float]:
|
||||
if t.strip().lower() in enums:
|
||||
return True, 0.90
|
||||
|
||||
elif rule.rule_type == "similarity":
|
||||
benchmarks = [b.strip().lower() for b in rule.rule_content.split(",") if b.strip()]
|
||||
if not benchmarks:
|
||||
return False, 0.0
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
texts = [t.lower() for t in targets] + benchmarks
|
||||
try:
|
||||
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 3))
|
||||
tfidf = vectorizer.fit_transform(texts)
|
||||
target_vecs = tfidf[:len(targets)]
|
||||
bench_vecs = tfidf[len(targets):]
|
||||
sim_matrix = cosine_similarity(target_vecs, bench_vecs)
|
||||
max_sim = float(sim_matrix.max())
|
||||
if max_sim >= 0.75:
|
||||
return True, round(min(max_sim, 0.99), 4)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return False, 0.0
|
||||
|
||||
|
||||
def run_auto_classification(db: Session, project_id: int, source_ids: Optional[List[int]] = None) -> dict:
|
||||
"""Run automatic classification for a project."""
|
||||
def run_auto_classification(
|
||||
db: Session,
|
||||
project_id: int,
|
||||
source_ids: Optional[List[int]] = None,
|
||||
progress_callback=None,
|
||||
) -> dict:
|
||||
"""Run automatic classification for a project.
|
||||
|
||||
Args:
|
||||
progress_callback: Optional callable(scanned, matched, total) to report progress.
|
||||
"""
|
||||
project = db.query(ClassificationProject).filter(ClassificationProject.id == project_id).first()
|
||||
if not project:
|
||||
return {"success": False, "message": "项目不存在"}
|
||||
@@ -82,7 +110,10 @@ def run_auto_classification(db: Session, project_id: int, source_ids: Optional[L
|
||||
columns = columns_query.all()
|
||||
|
||||
matched_count = 0
|
||||
for col in columns:
|
||||
total = len(columns)
|
||||
report_interval = max(1, total // 20) # report ~20 times
|
||||
|
||||
for idx, col in enumerate(columns):
|
||||
# Check if already has a result for this project
|
||||
existing = db.query(ClassificationResult).filter(
|
||||
ClassificationResult.project_id == project_id,
|
||||
@@ -121,12 +152,20 @@ def run_auto_classification(db: Session, project_id: int, source_ids: Optional[L
|
||||
# Increment hit count
|
||||
best_rule.hit_count = (best_rule.hit_count or 0) + 1
|
||||
|
||||
# Report progress periodically
|
||||
if progress_callback and (idx + 1) % report_interval == 0:
|
||||
progress_callback(scanned=idx + 1, matched=matched_count, total=total)
|
||||
|
||||
db.commit()
|
||||
|
||||
# Final progress report
|
||||
if progress_callback:
|
||||
progress_callback(scanned=total, matched=matched_count, total=total)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"message": f"自动分类完成,共扫描 {len(columns)} 个字段,命中 {matched_count} 个",
|
||||
"scanned": len(columns),
|
||||
"message": f"自动分类完成,共扫描 {total} 个字段,命中 {matched_count} 个",
|
||||
"scanned": total,
|
||||
"matched": matched_count,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user