feat: initial commit - Phase 1 & 2 core features

This commit is contained in:
hiderfong
2026-04-22 17:07:33 +08:00
commit 1773bda06b
25005 changed files with 6252106 additions and 0 deletions
@@ -0,0 +1,134 @@
import re
import json
from typing import List, Optional, Tuple
from sqlalchemy.orm import Session
from app.models.classification import RecognitionRule, Category, DataLevel
from app.models.metadata import DataColumn, DataTable
from app.models.project import ClassificationProject, ClassificationResult, ResultStatus
def match_rule(rule: RecognitionRule, column: DataColumn) -> Tuple[bool, float]:
"""Match a single rule against a column. Returns (matched, confidence)."""
targets = []
if rule.target_field == "column_name":
targets = [column.name]
elif rule.target_field == "comment":
targets = [column.comment or ""]
elif rule.target_field == "sample_data":
targets = []
if column.sample_data:
try:
samples = json.loads(column.sample_data)
if isinstance(samples, list):
targets = [str(s) for s in samples]
except Exception:
targets = [column.sample_data]
if not targets:
return False, 0.0
if rule.rule_type == "regex":
try:
pattern = re.compile(rule.rule_content)
for t in targets:
if pattern.search(t):
return True, 0.85
except re.error:
return False, 0.0
elif rule.rule_type == "keyword":
keywords = [k.strip().lower() for k in rule.rule_content.split(",")]
for t in targets:
t_lower = t.lower()
for kw in keywords:
if kw in t_lower:
return True, 0.75
elif rule.rule_type == "enum":
enums = [e.strip().lower() for e in rule.rule_content.split(",")]
for t in targets:
if t.strip().lower() in enums:
return True, 0.90
return False, 0.0
def run_auto_classification(db: Session, project_id: int, source_ids: Optional[List[int]] = None) -> dict:
"""Run automatic classification for a project."""
project = db.query(ClassificationProject).filter(ClassificationProject.id == project_id).first()
if not project:
return {"success": False, "message": "项目不存在"}
# Get active rules from project's template
rules = db.query(RecognitionRule).filter(
RecognitionRule.is_active == True,
RecognitionRule.template_id == project.template_id,
).order_by(RecognitionRule.priority).all()
if not rules:
return {"success": False, "message": "没有可用的识别规则"}
# Get columns to classify
from app.services.metadata_service import list_tables, list_columns
columns_query = db.query(DataColumn).join(DataTable).join(app.models.metadata.Database)
if source_ids:
columns_query = columns_query.filter(app.models.metadata.Database.source_id.in_(source_ids))
elif project.target_source_ids:
sids = [int(x) for x in project.target_source_ids.split(",") if x]
columns_query = columns_query.filter(app.models.metadata.Database.source_id.in_(sids))
columns = columns_query.all()
matched_count = 0
for col in columns:
# Check if already has a result for this project
existing = db.query(ClassificationResult).filter(
ClassificationResult.project_id == project_id,
ClassificationResult.column_id == col.id,
).first()
best_rule = None
best_confidence = 0.0
for rule in rules:
matched, confidence = match_rule(rule, col)
if matched and confidence > best_confidence:
best_confidence = confidence
best_rule = rule
if best_rule:
matched_count += 1
if existing:
existing.category_id = best_rule.category_id
existing.level_id = best_rule.level_id
existing.confidence = best_confidence
existing.source = "auto"
existing.status = ResultStatus.AUTO.value
else:
result = ClassificationResult(
project_id=project_id,
column_id=col.id,
category_id=best_rule.category_id,
level_id=best_rule.level_id,
source="auto",
confidence=best_confidence,
status=ResultStatus.AUTO.value,
)
db.add(result)
# Increment hit count
best_rule.hit_count = (best_rule.hit_count or 0) + 1
db.commit()
return {
"success": True,
"message": f"自动分类完成,共扫描 {len(columns)} 个字段,命中 {matched_count}",
"scanned": len(columns),
"matched": matched_count,
}
import app.models.metadata