import re
from typing import List, Optional

from sqlalchemy.orm import Session

from app.models.lineage import DataLineage

# Matches either a `--` line comment (up to, but not including, the newline or
# end of input) or a non-greedy `/* ... */` block comment. Using one
# alternation means whichever comment opens first in the text wins, so a `--`
# inside a block comment (or a `/*` inside a line comment) is not misread.
_COMMENT_RE = re.compile(r"--[^\n]*|/\*.*?\*/", re.DOTALL)

# A table reference following FROM or JOIN in already-lowercased SQL.
# Dotted (schema-qualified) names such as `schema.table` are captured whole.
_SOURCE_TABLE_RE = re.compile(
    r"\b(?:from|join)\s+([a-z_][a-z0-9_]*(?:\.[a-z_][a-z0-9_]*)*)"
)

# Number of leading characters of the SQL script stored on each lineage record.
# NOTE(review): presumably matches the width of the script_content column -
# confirm against the DataLineage model.
_SCRIPT_CONTENT_MAX_LEN = 2000

# Upper bound on the number of lineage rows loaded into one graph.
_MAX_GRAPH_EDGES = 500


def _extract_tables(sql: str) -> List[str]:
    """Extract source table names from SQL using regex (simple heuristic).

    Comments are removed, the text is lowercased, and every identifier that
    directly follows ``FROM`` or ``JOIN`` is collected. Only bare or dotted
    identifiers are recognised; quoted or bracketed names and comma-separated
    table lists are not handled. This is not a SQL parser: ``--`` or ``/*``
    inside string literals is treated as a comment, and any ``from <word>``
    (for example inside ``extract(year from col)``) is reported as a table.

    Args:
        sql: Raw SQL text.

    Returns:
        The distinct lowercased table names in sorted order; an empty list
        when ``sql`` is empty or contains no match.
    """
    if not sql:
        return []
    cleaned = _COMMENT_RE.sub(" ", sql).lower()
    return sorted(set(_SOURCE_TABLE_RE.findall(cleaned)))


def parse_sql_lineage(db: Session, sql: str, target_table: str) -> List[DataLineage]:
    """Parse SQL and create lineage records pointing to target_table.

    For every source table found in ``sql`` that is not the target itself and
    has no existing (source, target) row, a new ``DataLineage`` with
    ``relation_type="direct"`` is added to the session. The session is then
    committed.

    Args:
        db: SQLAlchemy session used for the lookups, inserts and commit.
        sql: SQL script to analyse; its first ``_SCRIPT_CONTENT_MAX_LEN``
            characters are stored on each new record.
        target_table: Table the script writes to. It is stored and queried
            exactly as given.

    Returns:
        The newly created records (already-existing edges are not included).

    Raises:
        Exception: Whatever ``db.commit()`` raises; the session is rolled back
            before the error propagates.
    """
    # Extracted names are lowercased, so compare against a lowercased target;
    # otherwise a mixed-case target would be recorded as its own source.
    target_key = target_table.lower()
    snippet = sql[:_SCRIPT_CONTENT_MAX_LEN]

    records = []
    for source in _extract_tables(sql):
        if source == target_key:
            continue
        existing = (
            db.query(DataLineage)
            .filter(
                DataLineage.source_table == source,
                DataLineage.target_table == target_table,
            )
            .first()
        )
        if existing is not None:
            continue
        rec = DataLineage(
            source_table=source,
            target_table=target_table,
            relation_type="direct",
            script_content=snippet,
        )
        db.add(rec)
        records.append(rec)

    try:
        db.commit()
    except Exception:
        # Leave the session usable for the caller, then surface the failure.
        db.rollback()
        raise
    return records


def get_lineage_graph(db: Session, table_name: Optional[str] = None) -> dict:
    """Build graph data for ECharts.

    Args:
        db: SQLAlchemy session used to read ``DataLineage`` rows.
        table_name: When truthy, only rows where this table is the source or
            the target are included; otherwise all rows are considered.

    Returns:
        A dict with ``"nodes"`` (one ``{"name", "category"}`` entry per
        distinct table) and ``"links"`` (one ``{"source", "target", "value"}``
        entry per row, ``value`` being the row's ``relation_type``). At most
        ``_MAX_GRAPH_EDGES`` rows are read; no ordering is applied to the
        query.
    """
    query = db.query(DataLineage)
    if table_name:
        query = query.filter(
            (DataLineage.source_table == table_name)
            | (DataLineage.target_table == table_name)
        )

    nodes = {}
    links = []
    for item in query.limit(_MAX_GRAPH_EDGES).all():
        # Category 0 marks a source, 1 a target. Entries are keyed by table
        # name, so a table that appears in both roles keeps whichever
        # category was assigned last.
        nodes[item.source_table] = {"name": item.source_table, "category": 0}
        nodes[item.target_table] = {"name": item.target_table, "category": 1}
        links.append(
            {
                "source": item.source_table,
                "target": item.target_table,
                "value": item.relation_type,
            }
        )

    return {
        "nodes": list(nodes.values()),
        "links": links,
    }