# Source code for voxatlas.features.morphology.agreement.features

import pandas as pd

from voxatlas.features.base_extractor import BaseExtractor
from voxatlas.features.feature_output import TableFeatureOutput
from voxatlas.registry.feature_registry import registry
from voxatlas.syntax.agreement_utils import extract_agreement_features



# [docs] (Sphinx "view source" link marker left over from HTML extraction; not code)
class AgreementFeaturesExtractor(BaseExtractor):
    r"""
    Extract the ``morphology.agreement.features`` feature within the VoxAtlas pipeline.

    This public extractor consumes ``token`` units and produces a table aligned to ``token`` units. It reads the upstream ``morphology.inflection.features`` table from the pipeline feature store, derives agreement columns from it, and returns both sets of columns side by side.

    Algorithm
    ---------
    1. Morphological preparation
       The token-level inflection table is fetched from the feature store found in ``feature_input.context["feature_store"]``.

    2. Feature computation
       ``extract_agreement_features`` is applied to the inflection table to obtain the agreement columns (for instance ``SubjectVerbAgreement``, as shown in the example below).

    3. Packaging
       The agreement columns (minus any ``id`` column, which the inflection table already carries) are concatenated column-wise with the inflection table. Both tables are re-indexed positionally first so rows are paired by position rather than by index label. The merged table is returned as a ``TableFeatureOutput``.

    Notes
    -----
    This extractor declares the upstream dependencies ['morphology.inflection.features'] and is executed only after those features are available in the pipeline feature store.

    Examples
    --------
    >>> import pandas as pd
    >>> from voxatlas.features.feature_input import FeatureInput
    >>> from voxatlas.features.feature_output import TableFeatureOutput
    >>> from voxatlas.features.morphology.agreement.features import AgreementFeaturesExtractor
    >>> from voxatlas.pipeline.feature_store import FeatureStore
    >>> inflection = pd.DataFrame(
    ...     [
    ...         {"id": 1, "head": 2, "dep_rel": "nsubj", "Person": 3, "Number": "Sing"},
    ...         {"id": 2, "head": 0, "dep_rel": "root", "Person": 3, "Number": "Sing"},
    ...     ]
    ... )
    >>> store = FeatureStore()
    >>> store.add("morphology.inflection.features", TableFeatureOutput(feature="morphology.inflection.features", unit="token", values=inflection))
    >>> feature_input = FeatureInput(audio=None, units=None, context={"feature_store": store})
    >>> out = AgreementFeaturesExtractor().compute(feature_input, {})
    >>> list(map(float, out.values["SubjectVerbAgreement"].tolist()))
    [1.0, 1.0]
    """

    # Registry key under which this extractor's output is stored.
    name = "morphology.agreement.features"
    # Unit level consumed and produced by this extractor.
    input_units = "token"
    output_units = "token"
    # Features that must already be present in the feature store.
    dependencies = ["morphology.inflection.features"]
    # No tunable parameters; ``compute`` does not read ``params``.
    default_config = {}

    def compute(self, feature_input, params) -> TableFeatureOutput:
        """
        Compute the agreement feature table for a single pipeline invocation.

        The inflection table is read from the feature store, agreement columns are derived from it, and the two are joined column-wise into one token-level table.

        Parameters
        ----------
        feature_input : object
            Structured extractor input. Only ``feature_input.context["feature_store"]`` is read; it must expose ``get(name)`` returning an object whose ``values`` attribute is the inflection ``DataFrame``.
        params : object
            Resolved feature configuration for this invocation. Currently unused by this extractor.

        Returns
        -------
        TableFeatureOutput
            Output whose ``values`` holds the inflection columns followed by the agreement columns, with a fresh positional (0..n-1) index, and whose ``unit`` is the extractor's ``output_units`` (``"token"``).

        Examples
        --------
        >>> import pandas as pd
        >>> from voxatlas.features.feature_input import FeatureInput
        >>> from voxatlas.features.feature_output import TableFeatureOutput
        >>> from voxatlas.features.morphology.agreement.features import AgreementFeaturesExtractor
        >>> from voxatlas.pipeline.feature_store import FeatureStore
        >>> inflection = pd.DataFrame(
        ...     [
        ...         {"id": 1, "head": 2, "dep_rel": "nsubj", "Person": 3, "Number": "Sing"},
        ...         {"id": 2, "head": 0, "dep_rel": "root", "Person": 3, "Number": "Sing"},
        ...     ]
        ... )
        >>> store = FeatureStore()
        >>> store.add("morphology.inflection.features", TableFeatureOutput(feature="morphology.inflection.features", unit="token", values=inflection))
        >>> feature_input = FeatureInput(audio=None, units=None, context={"feature_store": store})
        >>> result = AgreementFeaturesExtractor().compute(feature_input, {})
        >>> result.unit
        'token'
        """
        feature_store = feature_input.context["feature_store"]
        inflection_table = feature_store.get("morphology.inflection.features").values
        agreement_table = extract_agreement_features(inflection_table)

        # ``pd.concat(axis=1)`` pairs rows by index label, not by position.
        # Re-index BOTH frames positionally; resetting only one side would
        # misalign rows (and pad with NaN) whenever the agreement table comes
        # back with a non-default index.
        inflection_columns = inflection_table.reset_index(drop=True)
        # ``id`` is dropped from the agreement side so the merged table does not
        # carry a duplicate of the column already present in the inflection table.
        agreement_columns = agreement_table.drop(
            columns=["id"], errors="ignore"
        ).reset_index(drop=True)

        merged = pd.concat([inflection_columns, agreement_columns], axis=1)

        return TableFeatureOutput(
            feature=self.name,
            unit=self.output_units,
            values=merged,
        )




# Import-time side effect: hand the extractor class to the shared feature
# registry (presumably so the pipeline can resolve it by its ``name`` —
# confirm against ``voxatlas.registry.feature_registry``).
registry.register(AgreementFeaturesExtractor)