Source code for dbs_annotator.utils.tsv_columns

"""Canonical TSV column names and legacy alias handling."""

from __future__ import annotations

from typing import Any

import pandas as pd

# Canonical header name for the block-index column.
BLOCK_ID_COLUMN = "block_ID"
# Legacy spellings of the block-index header that are still accepted on input
# and mapped to BLOCK_ID_COLUMN by the helpers in this module.
BLOCK_ID_LEGACY_KEYS = ("block_id", "blockId", "blockID")


def block_id_from_row(row: dict[str, Any]) -> Any:
    """Look up the block index in a ``csv.DictReader``-style row.

    The canonical ``block_ID`` key is tried first, followed by the legacy
    spellings (``block_id``, ``blockId``, ``blockID``) in that order.

    Args:
        row: Mapping of column header to cell value.

    Returns:
        The first value found that is neither ``None`` nor the empty string,
        or ``None`` when no candidate key carries such a value.
    """
    candidate_keys = (BLOCK_ID_COLUMN, *BLOCK_ID_LEGACY_KEYS)
    # Lazily fetch each candidate so the lookup stops at the first usable one.
    candidate_values = (row.get(name) for name in candidate_keys)
    return next(
        (value for value in candidate_values if value is not None and value != ""),
        None,
    )
def normalize_block_id_dataframe(df: pd.DataFrame | None) -> pd.DataFrame | None:
    """Rename a legacy block-index column to ``block_ID``.

    Args:
        df: Frame to normalize, or ``None``.

    Returns:
        ``None`` when *df* is ``None``. The same *df* object when it already
        has a ``block_ID`` column or has no legacy block-index column.
        Otherwise the result of ``df.rename`` with the first legacy column
        found (in ``BLOCK_ID_LEGACY_KEYS`` order) renamed to ``block_ID``.
    """
    if df is None:
        return None

    # Deliberately no ``df.empty`` short-circuit: a header-only frame (columns
    # but zero rows) must still get its legacy header renamed, otherwise
    # callers that index ``block_ID`` fail on empty session files.
    columns = df.columns
    if BLOCK_ID_COLUMN in columns:
        return df

    for candidate in BLOCK_ID_LEGACY_KEYS:
        if candidate in columns:
            return df.rename(columns={candidate: BLOCK_ID_COLUMN})
    return df
def read_session_tsv(path: str) -> pd.DataFrame:
    """Load a programming-session TSV with a normalized block-index column.

    The file is parsed as tab-separated text with ``na_filter=False`` and then
    passed through ``normalize_block_id_dataframe``.

    Args:
        path: Location of the TSV file, handed to ``pandas.read_csv``.

    Returns:
        The normalized frame, or the frame as read if normalization
        returned ``None``.
    """
    raw_frame = pd.read_csv(path, sep="\t", na_filter=False)
    renamed_frame = normalize_block_id_dataframe(raw_frame)
    if renamed_frame is not None:
        return renamed_frame
    return raw_frame
def canonicalize_row_block_id(row: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *row* keyed by ``block_ID`` instead of legacy names.

    Args:
        row: Mapping of column header to cell value; it is not modified.

    Returns:
        A new dict without any of the legacy block-index keys. When
        ``block_id_from_row`` finds a block value, it is stored under
        ``block_ID``.
    """
    block_value = block_id_from_row(row)
    # Build the copy without the legacy keys rather than popping them afterwards.
    canonical = {
        key: value for key, value in row.items() if key not in BLOCK_ID_LEGACY_KEYS
    }
    if block_value is not None:
        canonical[BLOCK_ID_COLUMN] = block_value
    return canonical
def normalize_tsv_fieldnames(fieldnames: list[str] | None) -> list[str]:
    """Canonicalize the block-index header in a list of TSV field names.

    Every spelling of the block-index column (canonical or legacy) collapses
    into a single ``block_ID`` entry placed where the first such header
    appeared; all other names keep their order.

    Args:
        fieldnames: Header names from the TSV, or ``None``.

    Returns:
        A new list of header names; empty when *fieldnames* is ``None`` or
        empty.
    """
    if not fieldnames:
        return []

    block_aliases = (BLOCK_ID_COLUMN, *BLOCK_ID_LEGACY_KEYS)
    result: list[str] = []
    emitted_block_column = False
    for header in fieldnames:
        if header not in block_aliases:
            result.append(header)
        elif not emitted_block_column:
            # Only the first block-index header yields an output column.
            result.append(BLOCK_ID_COLUMN)
            emitted_block_column = True
    return result