# Source code for voxatlas.features.lexical.frequency.word_frequency
import pandas as pd
from voxatlas.features.base_extractor import BaseExtractor
from voxatlas.features.feature_output import ScalarFeatureOutput
from voxatlas.registry.feature_registry import registry
[docs]
class WordFrequencyExtractor(BaseExtractor):
    r"""
    Expose per-token word frequencies as the ``lexical.frequency.word`` feature.

    The extractor is a thin projection over its single declared dependency,
    ``lexical.frequency.lookup``. It takes that feature's table, keeps the
    ``frequency`` column (cast to ``float32``) and re-indexes it by the
    table's ``id`` column. Both the consumed and the produced unit are
    ``token``.

    Algorithm
    ---------
    1. Fetch the ``lexical.frequency.lookup`` output from the feature store
       found under ``feature_input.context["feature_store"]``.
    2. Read the ``frequency`` column of its ``values`` table and cast it to
       ``float32``.
    3. Wrap the result in a ``pandas.Series`` indexed by the ``id`` column and
       return it inside a ``ScalarFeatureOutput``.

    Notes
    -----
    ``dependencies`` lists ``lexical.frequency.lookup``; ``compute`` expects
    that feature to already be present in the feature store when it is called.

    Examples
    --------
    >>> import pandas as pd
    >>> from voxatlas.features.feature_input import FeatureInput
    >>> from voxatlas.features.feature_output import TableFeatureOutput
    >>> from voxatlas.features.lexical.frequency.word_frequency import WordFrequencyExtractor
    >>> from voxatlas.pipeline.feature_store import FeatureStore
    >>> table = pd.DataFrame({"id": [1], "frequency": [10.0]})
    >>> store = FeatureStore()
    >>> store.add("lexical.frequency.lookup", TableFeatureOutput(feature="lexical.frequency.lookup", unit="token", values=table))
    >>> out = WordFrequencyExtractor().compute(FeatureInput(audio=None, units=None, context={"feature_store": store}), {})
    >>> float(out.values.loc[1])
    10.0
    """

    # Identity and wiring of this extractor.
    name = "lexical.frequency.word"
    input_units = "token"
    output_units = "token"
    dependencies = ["lexical.frequency.lookup"]
    default_config = {}

    def compute(self, feature_input, params):
        """
        Return the lookup table's frequencies as a token-indexed series.

        Parameters
        ----------
        feature_input : FeatureInput
            Input bundle whose ``context["feature_store"]`` holds the
            ``lexical.frequency.lookup`` output.
        params : dict
            Extractor configuration. Not read by this method; accepted so the
            signature matches other extractors.

        Returns
        -------
        ScalarFeatureOutput
            Output tagged with this extractor's ``name`` and the ``token``
            unit. Its ``values`` is a ``float32`` ``pandas.Series`` indexed by
            the lookup table's ``id`` column.

        Raises
        ------
        KeyError
            If ``"feature_store"`` is missing from ``feature_input.context``
            or the lookup table has no ``id`` / ``frequency`` column (assuming
            a dict context and a DataFrame table). How an absent lookup
            feature is reported depends on the feature store's ``get``.

        Examples
        --------
        >>> import pandas as pd
        >>> from voxatlas.features.feature_input import FeatureInput
        >>> from voxatlas.features.feature_output import TableFeatureOutput
        >>> from voxatlas.features.lexical.frequency.word_frequency import WordFrequencyExtractor
        >>> from voxatlas.pipeline.feature_store import FeatureStore
        >>> table = pd.DataFrame({"id": [1], "frequency": [10.0]})
        >>> store = FeatureStore()
        >>> store.add("lexical.frequency.lookup", TableFeatureOutput(feature="lexical.frequency.lookup", unit="token", values=table))
        >>> result = WordFrequencyExtractor().compute(FeatureInput(audio=None, units=None, context={"feature_store": store}), {})
        >>> result.unit
        'token'
        """
        # Step 1: locate the upstream lookup table.
        store = feature_input.context["feature_store"]
        lookup_output = store.get("lexical.frequency.lookup")
        lookup_table = lookup_output.values

        # Step 2: take the raw array so the series is built positionally
        # against the ids rather than aligned on the table's own index.
        frequencies = lookup_table["frequency"].astype("float32").values
        token_ids = lookup_table["id"]
        per_token = pd.Series(frequencies, index=token_ids)

        # Step 3: package under this extractor's feature name.
        return ScalarFeatureOutput(feature=self.name, unit="token", values=per_token)
# Register the extractor class with the imported feature registry; this runs
# as a side effect whenever the module is imported.
registry.register(WordFrequencyExtractor)