135 lines
4.7 KiB
Python
135 lines
4.7 KiB
Python
import re
|
|
import json
|
|
from typing import List, Optional, Tuple
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.models.classification import RecognitionRule, Category, DataLevel
|
|
from app.models.metadata import DataColumn, DataTable
|
|
from app.models.project import ClassificationProject, ClassificationResult, ResultStatus
|
|
|
|
|
|
# Confidence reported for a hit, keyed by the kind of rule that produced it.
_REGEX_CONFIDENCE = 0.85
_KEYWORD_CONFIDENCE = 0.75
_ENUM_CONFIDENCE = 0.90


def _collect_targets(target_field: str, column: DataColumn) -> List[str]:
    """Return the strings of ``column`` that a rule on ``target_field`` inspects."""
    if target_field == "column_name":
        return [column.name or ""]
    if target_field == "comment":
        return [column.comment or ""]
    if target_field != "sample_data":
        return []

    raw = column.sample_data
    if not raw:
        return []
    try:
        samples = json.loads(raw)
    except (ValueError, TypeError):
        # Not JSON text (or not text at all): use the stored value as-is.
        samples = raw
    if isinstance(samples, (list, tuple)):
        # Null samples carry no data; str(None) would spuriously match "none".
        return [str(sample) for sample in samples if sample is not None]
    # Raw text that merely happens to parse as a JSON scalar/object (e.g. a
    # digit-only sample) is still a sample: match against the original text.
    return [str(raw)]


def _split_terms(content: str) -> List[str]:
    """Split comma-separated rule content into lowercase terms, dropping blanks."""
    terms = (term.strip().lower() for term in content.split(","))
    return [term for term in terms if term]


def match_rule(rule: RecognitionRule, column: DataColumn) -> Tuple[bool, float]:
    """Match a single rule against a column.

    The text inspected is chosen by ``rule.target_field``: ``"column_name"``,
    ``"comment"`` or ``"sample_data"`` (a JSON list of samples, or raw text).
    ``rule.rule_type`` selects how ``rule.rule_content`` is applied:

    * ``"regex"``   - ``re.search`` of the pattern on any target (0.85).
    * ``"keyword"`` - any comma-separated keyword is a case-insensitive
      substring of any target (0.75).
    * ``"enum"``    - any target, stripped and lowercased, equals one of the
      comma-separated values (0.90).

    Blank terms (from trailing or doubled commas) are ignored so they cannot
    match every column, and empty rule content never matches.

    Returns:
        ``(matched, confidence)``; ``(False, 0.0)`` when nothing matches, the
        target field or rule type is unknown, or the regex is invalid.
    """
    targets = _collect_targets(rule.target_field, column)
    content = rule.rule_content or ""
    if not targets or not content:
        return False, 0.0

    if rule.rule_type == "regex":
        try:
            pattern = re.compile(content)
        except re.error:
            return False, 0.0
        if any(pattern.search(target) for target in targets):
            return True, _REGEX_CONFIDENCE

    elif rule.rule_type == "keyword":
        keywords = _split_terms(content)
        lowered = [target.lower() for target in targets]
        if any(keyword in text for text in lowered for keyword in keywords):
            return True, _KEYWORD_CONFIDENCE

    elif rule.rule_type == "enum":
        allowed = set(_split_terms(content))
        if any(target.strip().lower() in allowed for target in targets):
            return True, _ENUM_CONFIDENCE

    return False, 0.0
|
|
|
|
|
|
def run_auto_classification(db: Session, project_id: int, source_ids: Optional[List[int]] = None) -> dict:
    """Run automatic classification for a project.

    Loads the active recognition rules of the project's template (ordered by
    priority), matches them against every column in scope and stores the
    highest-confidence match per column as a ``ClassificationResult``. On
    equal confidence the rule that sorts first by priority wins.

    Args:
        db: SQLAlchemy session used for all queries and the final commit.
        project_id: ID of the ``ClassificationProject`` to classify.
        source_ids: Optional data-source IDs limiting the scanned columns.
            When empty or ``None``, the project's comma-separated
            ``target_source_ids`` are used; if that is empty too, all columns
            are scanned.

    Returns:
        A dict with ``success`` and ``message``; on success it also carries
        ``scanned`` (columns examined) and ``matched`` (columns classified).

    Raises:
        ValueError: if ``target_source_ids`` holds a non-integer token.
    """
    project = db.query(ClassificationProject).filter(ClassificationProject.id == project_id).first()
    if not project:
        return {"success": False, "message": "项目不存在"}

    # Get active rules from project's template
    rules = (
        db.query(RecognitionRule)
        .filter(
            RecognitionRule.is_active == True,  # noqa: E712 - SQL expression, not a Python comparison
            RecognitionRule.template_id == project.template_id,
        )
        .order_by(RecognitionRule.priority)
        .all()
    )
    if not rules:
        return {"success": False, "message": "没有可用的识别规则"}

    # Get columns to classify. ``app`` is bound by the module-level
    # ``import app.models.metadata``.
    database_model = app.models.metadata.Database
    columns_query = db.query(DataColumn).join(DataTable).join(database_model)
    if source_ids:
        columns_query = columns_query.filter(database_model.source_id.in_(source_ids))
    elif project.target_source_ids:
        # Skip blank tokens (e.g. "1, ,2" or a trailing comma) instead of
        # failing on int("").
        project_source_ids = [
            int(token) for token in project.target_source_ids.split(",") if token.strip()
        ]
        columns_query = columns_query.filter(database_model.source_id.in_(project_source_ids))
    columns = columns_query.all()

    # Fetch this project's existing results once rather than issuing one
    # query per column inside the loop.
    existing_by_column = {}
    project_results = (
        db.query(ClassificationResult)
        .filter(ClassificationResult.project_id == project_id)
        .all()
    )
    for previous in project_results:
        existing_by_column.setdefault(previous.column_id, previous)

    matched_count = 0
    for col in columns:
        best_rule = None
        best_confidence = 0.0
        for rule in rules:
            matched, confidence = match_rule(rule, col)
            # Strict ">" keeps the earlier (priority-ordered) rule on ties.
            if matched and confidence > best_confidence:
                best_confidence = confidence
                best_rule = rule

        if best_rule is None:
            continue

        matched_count += 1
        existing = existing_by_column.get(col.id)
        if existing:
            # NOTE(review): an existing result is overwritten whatever its
            # current source/status - confirm manual decisions may be replaced.
            existing.category_id = best_rule.category_id
            existing.level_id = best_rule.level_id
            existing.confidence = best_confidence
            existing.source = "auto"
            existing.status = ResultStatus.AUTO.value
        else:
            result = ClassificationResult(
                project_id=project_id,
                column_id=col.id,
                category_id=best_rule.category_id,
                level_id=best_rule.level_id,
                source="auto",
                confidence=best_confidence,
                status=ResultStatus.AUTO.value,
            )
            db.add(result)
            # Remember it so a column seen twice is updated, not duplicated.
            existing_by_column[col.id] = result

        # Increment hit count
        best_rule.hit_count = (best_rule.hit_count or 0) + 1

    try:
        db.commit()
    except Exception:
        # Leave the session usable for the caller, then surface the failure.
        db.rollback()
        raise

    return {
        "success": True,
        "message": f"自动分类完成,共扫描 {len(columns)} 个字段,命中 {matched_count} 个",
        "scanned": len(columns),
        "matched": matched_count,
    }
|
|
|
|
|
|
import app.models.metadata
|