diff --git a/docs/api-reference/application.md b/docs/api-reference/application.md
index f66e0c08e..187faaacc 100644
--- a/docs/api-reference/application.md
+++ b/docs/api-reference/application.md
@@ -10,7 +10,7 @@
class Application()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L55)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L59)
The main Application class.
@@ -62,7 +62,7 @@ def __init__(broker_address: Optional[str] = None,
quix_sdk_token: Optional[str] = None,
consumer_group: Optional[str] = None,
auto_offset_reset: AutoOffsetReset = "latest",
- auto_commit_enable: bool = True,
+ commit_interval: float = 5.0,
partitioner: Partitioner = "murmur2",
consumer_extra_config: Optional[dict] = None,
producer_extra_config: Optional[dict] = None,
@@ -81,7 +81,7 @@ def __init__(broker_address: Optional[str] = None,
topic_manager: Optional[TopicManager] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L93)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L97)
@@ -102,9 +102,9 @@ Passed as `group.id` to `confluent_kafka.Consumer`.
Linked Environment Variable: `Quix__Consumer__Group`.
Default - "quixstreams-default" (set during init)
>***NOTE:*** Quix Applications will prefix it with the Quix workspace id.
+- `commit_interval`: How often to commit the processed messages, in seconds.
+Default - 5.0.
- `auto_offset_reset`: Consumer `auto.offset.reset` setting
-- `auto_commit_enable`: If true, periodically commit offset of
-the last message handed to the application. Default - `True`.
- `partitioner`: A function to be used to determine the outgoing message
partition.
- `consumer_extra_config`: A dictionary with additional options that
@@ -157,7 +157,6 @@ instead of the default one.
def Quix(cls,
consumer_group: Optional[str] = None,
auto_offset_reset: AutoOffsetReset = "latest",
- auto_commit_enable: bool = True,
partitioner: Partitioner = "murmur2",
consumer_extra_config: Optional[dict] = None,
producer_extra_config: Optional[dict] = None,
@@ -176,7 +175,7 @@ def Quix(cls,
topic_manager: Optional[QuixTopicManager] = None) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L296)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L303)
>***NOTE:*** DEPRECATED: use Application with `quix_sdk_token` argument instead.
@@ -224,8 +223,6 @@ Linked Environment Variable: `Quix__Consumer__Group`.
Default - "quixstreams-default" (set during init).
>***NOTE:*** Quix Applications will prefix it with the Quix workspace id.
- `auto_offset_reset`: Consumer `auto.offset.reset` setting
-- `auto_commit_enable`: If true, periodically commit offset of
-the last message handed to the application. Default - `True`.
- `partitioner`: A function to be used to determine the outgoing message
partition.
- `consumer_extra_config`: A dictionary with additional options that
@@ -288,7 +285,7 @@ def topic(name: str,
timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L436)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L439)
Create a topic definition.
@@ -369,7 +366,7 @@ topic = app.topic("input-topic", timestamp_extractor=custom_ts_extractor)
def dataframe(topic: Topic) -> StreamingDataFrame
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L516)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L519)
A simple helper method that generates a `StreamingDataFrame`, which is used
@@ -416,10 +413,10 @@ to be used as an input topic.
#### Application.stop
```python
-def stop()
+def stop(fail: bool = False)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L552)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L556)
Stop the internal poll loop and the message processing.
@@ -429,6 +426,13 @@ likely through some sort of threading).
To otherwise stop an application, either send a `SIGTERM` to the process
(like Kubernetes does) or perform a typical `KeyboardInterrupt` (`Ctrl+C`).
+
+
+***Arguments:***
+
+- `fail`: if `True`, signals that the application is stopped due
+to an unhandled exception, and it shouldn't commit the current checkpoint.
+
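+A minimal sketch of stopping the application from another thread (the `app` and
+`sdf` instances and the 60-second interval are assumptions for illustration):
+
+```python
+import threading
+
+# Schedule app.stop() from a background thread; with the default fail=False,
+# the current checkpoint can still be committed before the poll loop exits.
+threading.Timer(60.0, app.stop).start()
+app.run(sdf)
+```
+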
@@ -439,7 +443,7 @@ To otherwise stop an application, either send a `SIGTERM` to the process
def get_producer() -> Producer
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L566)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L579)
Create and return a pre-configured Producer instance.
The Producer is initialized with params passed to Application.
@@ -471,10 +475,10 @@ with app.get_producer() as producer:
#### Application.get\_consumer
```python
-def get_consumer() -> Consumer
+def get_consumer(auto_commit_enable: bool = True) -> Consumer
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L597)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L610)
Create and return a pre-configured Consumer instance.
The Consumer is initialized with params passed to Application.
@@ -519,7 +523,7 @@ with app.get_consumer() as consumer:
def clear_state()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L641)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L654)
Clear the state of the application.
@@ -533,11 +537,11 @@ Clear the state of the application.
def run(dataframe: StreamingDataFrame)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L719)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L660)
Start processing data from Kafka using provided `StreamingDataFrame`
-One started, can be safely terminated with a `SIGTERM` signal
+Once started, it can be safely terminated with a `SIGTERM` signal
(like Kubernetes does) or a typical `KeyboardInterrupt` (`Ctrl+C`).
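+
+A minimal end-to-end sketch (broker address, consumer group, and topic names are
+placeholders):
+
+```python
+from quixstreams import Application
+
+app = Application(broker_address="localhost:9092", consumer_group="example")
+input_topic = app.topic("input-topic")
+output_topic = app.topic("output-topic")
+
+# Build a simple pipeline: transform each value and produce it to another topic
+sdf = app.dataframe(input_topic)
+sdf = sdf.apply(lambda value: {**value, "processed": True})
+sdf = sdf.to_topic(output_topic)
+
+app.run(sdf)  # blocks until SIGTERM, Ctrl+C, or app.stop()
+```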
diff --git a/docs/api-reference/context.md b/docs/api-reference/context.md
index 77be373b9..bbb1a800e 100644
--- a/docs/api-reference/context.md
+++ b/docs/api-reference/context.md
@@ -12,7 +12,7 @@
def set_message_context(context: Optional[MessageContext])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L21)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L21)
Set a MessageContext for the current message in the given `contextvars.Context`
@@ -55,7 +55,7 @@ sdf = sdf.update(lambda value: alter_context(value))
def message_context() -> MessageContext
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L52)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L52)
Get a MessageContext for the current message, which houses most of the message
@@ -96,7 +96,7 @@ instance of `MessageContext`
def message_key() -> Any
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L83)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L83)
Get the current message's key.
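+
+A minimal sketch of using it inside a `StreamingDataFrame` callback (the `sdf`
+instance is assumed):
+
+```python
+from quixstreams.context import message_key
+
+# Attach the current Kafka message key to the value being processed
+sdf = sdf.apply(lambda value: {**value, "kafka_key": message_key()})
+```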
diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md
index 67e0c57f8..245c44b2f 100644
--- a/docs/api-reference/dataframe.md
+++ b/docs/api-reference/dataframe.md
@@ -10,7 +10,7 @@
class StreamingDataFrame(BaseStreaming)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L32)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L31)
`StreamingDataFrame` is the main object you will use for ETL work.
@@ -74,7 +74,7 @@ def apply(func: Union[DataFrameFunc, DataFrameStatefulFunc],
expand: bool = False) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L109)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L108)
Apply a function to transform the value and return a new value.
@@ -122,7 +122,7 @@ def update(func: Union[DataFrameFunc, DataFrameStatefulFunc],
stateful: bool = False) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L152)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L151)
Apply a function to mutate value in-place or to perform a side effect
@@ -170,7 +170,7 @@ def filter(func: Union[DataFrameFunc, DataFrameStatefulFunc],
stateful: bool = False) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L191)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L190)
Filter value using provided function.
@@ -218,7 +218,7 @@ of type `State` to perform stateful operations.
def contains(key: str) -> StreamingSeries
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L244)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L233)
Check if the key is present in the Row value.
@@ -258,7 +258,7 @@ def to_topic(topic: Topic,
key: Optional[Callable[[object], object]] = None) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L267)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L256)
Produce current value to a topic. You can optionally specify a new key.
@@ -306,7 +306,7 @@ By default, the current message key will be used.
def compose() -> StreamCallable
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L306)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L295)
Compose all functions of this StreamingDataFrame into one big closure.
@@ -349,7 +349,7 @@ and returns a result of StreamingDataFrame
def test(value: object, ctx: Optional[MessageContext] = None) -> Any
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L336)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L325)
A shorthand to test `StreamingDataFrame` with provided value
@@ -383,7 +383,7 @@ def tumbling_window(duration_ms: Union[int, timedelta],
name: Optional[str] = None) -> TumblingWindowDefinition
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L354)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L343)
Create a tumbling window transformation on this StreamingDataFrame.
@@ -468,7 +468,7 @@ def hopping_window(duration_ms: Union[int, timedelta],
name: Optional[str] = None) -> HoppingWindowDefinition
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L429)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L418)
Create a hopping window transformation on this StreamingDataFrame.
@@ -561,7 +561,7 @@ sdf = (
class StreamingSeries(BaseStreaming)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L17)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L40)
`StreamingSeries` are typically generated by `StreamingDataframes` when getting
elements from, or performing certain operations on, a `StreamingDataframe`,
@@ -627,7 +627,7 @@ sdf = sdf[["column_a"] & (sdf["new_sum_field"] >= 10)]
def from_func(cls, func: StreamCallable) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L77)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L100)
Create a StreamingSeries from a function.
@@ -655,7 +655,7 @@ instance of `StreamingSeries`
def apply(func: StreamCallable) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L91)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L114)
Add a callable to the execution list for this series.
@@ -708,7 +708,7 @@ def compose(allow_filters: bool = True,
allow_updates: bool = True) -> StreamCallable
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L125)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L148)
Compose all functions of this StreamingSeries into one big closure.
@@ -768,7 +768,7 @@ and returns a result of `StreamingSeries`
def test(value: Any, ctx: Optional[MessageContext] = None) -> Any
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L172)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L195)
A shorthand to test `StreamingSeries` with provided value
@@ -800,7 +800,7 @@ result of `StreamingSeries`
def isin(other: Container) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L208)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L231)
Check if series value is in "other".
@@ -845,7 +845,7 @@ new StreamingSeries
def contains(other: Union[Self, object]) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L235)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L258)
Check if series value contains "other"
@@ -890,7 +890,7 @@ new StreamingSeries
def is_(other: Union[Self, object]) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L260)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L283)
Check if series value refers to the same object as `other`
@@ -932,7 +932,7 @@ new StreamingSeries
def isnot(other: Union[Self, object]) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L283)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L306)
Check if series value does not refer to the same object as `other`
@@ -975,7 +975,7 @@ new StreamingSeries
def isnull() -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L307)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L330)
Check if series value is None.
@@ -1012,7 +1012,7 @@ new StreamingSeries
def notnull() -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L330)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L353)
Check if series value is not None.
@@ -1049,7 +1049,7 @@ new StreamingSeries
def abs() -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L353)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L376)
Get absolute value of the series value.
diff --git a/docs/api-reference/kafka.md b/docs/api-reference/kafka.md
index 986e1e6fe..5a8e93b16 100644
--- a/docs/api-reference/kafka.md
+++ b/docs/api-reference/kafka.md
@@ -10,7 +10,7 @@
class Producer()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L54)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L37)
@@ -24,7 +24,7 @@ def __init__(broker_address: str,
extra_config: Optional[dict] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L55)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L38)
A wrapper around `confluent_kafka.Producer`.
@@ -61,14 +61,15 @@ def produce(topic: str,
partition: Optional[int] = None,
timestamp: Optional[int] = None,
poll_timeout: float = 5.0,
- buffer_error_max_tries: int = 3)
+ buffer_error_max_tries: int = 3,
+ on_delivery: Optional[DeliveryCallback] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L94)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L74)
-Produce message to topic.
+Produce a message to a topic.
-It also polls Kafka for callbacks before producing in order to minimize
+It also polls Kafka for callbacks before producing to minimize
the probability of `BufferError`.
If `BufferError` still happens, the method will poll Kafka with timeout
to free up the buffer and try again.
@@ -86,6 +87,8 @@ to free up the buffer and try again.
- `poll_timeout`: timeout for `poll()` call in case of `BufferError`
- `buffer_error_max_tries`: max retries for `BufferError`.
Pass `0` to not retry after `BufferError`.
+- `on_delivery`: the delivery callback to be triggered on `poll()`
+for the produced message.
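+
+A hedged sketch of a delivery callback (topic, key, and value are placeholders;
+`producer` is assumed to come from `Application.get_producer()`):
+
+```python
+def delivery_report(err, msg):
+    # Standard confluent-kafka delivery callback signature: (error, message)
+    if err is not None:
+        print(f"Delivery failed: {err}")
+    else:
+        print(f"Delivered to {msg.topic()} [{msg.partition()}] @ {msg.offset()}")
+
+producer.produce(
+    topic="output-topic",
+    key=b"sensor-1",
+    value=b'{"temperature": 21.5}',
+    on_delivery=delivery_report,
+)
+producer.poll(0)  # triggers the callback for already-delivered messages
+```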
@@ -97,7 +100,7 @@ Pass `0` to not retry after `BufferError`.
def poll(timeout: float = 0)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L152)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L135)
Polls the producer for events and calls `on_delivery` callbacks.
@@ -118,7 +121,7 @@ Polls the producer for events and calls `on_delivery` callbacks.
def flush(timeout: Optional[float] = None) -> int
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L160)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L143)
Wait for all messages in the Producer queue to be delivered.
@@ -147,7 +150,7 @@ number of messages remaining to flush
class Consumer()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L66)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L66)
@@ -166,7 +169,7 @@ def __init__(broker_address: str,
extra_config: Optional[dict] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L67)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L67)
A wrapper around `confluent_kafka.Consumer`.
@@ -208,7 +211,7 @@ Note: values passed as arguments override values in `extra_config`.
def poll(timeout: Optional[float] = None) -> Optional[Message]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L126)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L126)
Consumes a single message, calls callbacks and returns events.
@@ -249,7 +252,7 @@ def subscribe(topics: List[str],
on_lost: Optional[RebalancingCallback] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L144)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L144)
Set subscription to supplied list of topics
@@ -292,7 +295,7 @@ for example, may fail.
def unsubscribe()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L238)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L238)
Remove current subscription.
@@ -312,7 +315,7 @@ def store_offsets(message: Optional[Message] = None,
offsets: Optional[List[TopicPartition]] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L246)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L246)
.. py:function:: store_offsets([message=None], [offsets=None])
@@ -347,7 +350,7 @@ def commit(message: Optional[Message] = None,
asynchronous: bool = True) -> Optional[List[TopicPartition]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L280)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L280)
Commit a message or a list of offsets.
@@ -385,7 +388,7 @@ def committed(partitions: List[TopicPartition],
timeout: Optional[float] = None) -> List[TopicPartition]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L320)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L320)
.. py:function:: committed(partitions, [timeout=None])
@@ -422,7 +425,7 @@ def get_watermark_offsets(partition: TopicPartition,
cached: bool = False) -> Tuple[int, int]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L340)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L340)
Retrieve low and high offsets for the specified partition.
@@ -461,7 +464,7 @@ def list_topics(topic: Optional[str] = None,
timeout: Optional[float] = None) -> ClusterMetadata
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L366)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L366)
.. py:function:: list_topics([topic=None], [timeout=-1])
@@ -494,7 +497,7 @@ None or -1 is infinite. Default: None
def memberid() -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L389)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L389)
Return this client's broker-assigned group member id.
@@ -517,7 +520,7 @@ def offsets_for_times(partitions: List[TopicPartition],
timeout: Optional[float] = None) -> List[TopicPartition]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L402)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L402)
Look up offsets by timestamp for the specified partitions.
@@ -546,7 +549,7 @@ last message in the partition, a value of -1 will be returned.
def pause(partitions: List[TopicPartition])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L428)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L428)
Pause consumption for the provided list of partitions.
@@ -574,7 +577,7 @@ Does NOT affect the result of Consumer.assignment().
def resume(partitions: List[TopicPartition])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L442)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L442)
.. py:function:: resume(partitions)
@@ -600,7 +603,7 @@ Resume consumption for the provided list of partitions.
def position(partitions: List[TopicPartition]) -> List[TopicPartition]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L454)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L454)
Retrieve current positions (offsets) for the specified partitions.
@@ -633,7 +636,7 @@ the last consumed message + 1.
def seek(partition: TopicPartition)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L468)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L468)
Set consume position for partition to offset.
@@ -665,7 +668,7 @@ pass the offset in an `assign()` call.
def assignment() -> List[TopicPartition]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L485)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L485)
Returns the current partition assignment.
@@ -690,7 +693,7 @@ Returns the current partition assignment.
def set_sasl_credentials(username: str, password: str)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L498)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L498)
Sets the SASL credentials used for this client.
These credentials will overwrite the old ones, and will be used the next
@@ -709,7 +712,7 @@ This method is applicable only to SASL PLAIN and SCRAM mechanisms.
def incremental_assign(partitions: List[TopicPartition])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L510)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L510)
Assign new partitions.
@@ -729,7 +732,7 @@ Any additional partitions besides the ones passed during the `Consumer`
def incremental_unassign(partitions: List[TopicPartition])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L522)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L522)
Revoke partitions.
@@ -745,7 +748,7 @@ Can be called outside an on_revoke callback.
def close()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L530)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L530)
Close down and terminate the Kafka Consumer.
diff --git a/docs/api-reference/quixstreams.md b/docs/api-reference/quixstreams.md
index c4205ca2e..d9bb4c7a2 100644
--- a/docs/api-reference/quixstreams.md
+++ b/docs/api-reference/quixstreams.md
@@ -2,966 +2,894 @@
## quixstreams
-
-
-## quixstreams.core
-
-
-
-## quixstreams.core.stream
-
-
+
-## quixstreams.core.stream.stream
+## quixstreams.logging
-
+
-### Stream
+#### configure\_logging
```python
-class Stream()
+def configure_logging(loglevel: Optional[LogLevel]) -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L22)
-
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/logging.py#L24)
-#### Stream.\_\_init\_\_
+Configure "quixstreams" logger.
-```python
-def __init__(func: Optional[StreamFunction] = None,
- parent: Optional[Self] = None)
-```
+>***NOTE:*** If "quixstreams" logger already has pre-defined handlers
+(e.g. logging has already been configured via `logging`, or the function
+is called twice), it will skip configuration and return `False`.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L23)
+**Arguments**:
-A base class for all streaming operations.
+- `loglevel`: a valid log level as a string or None.
+If None is passed, this function is a no-op and no logging will be configured.
-`Stream` is an abstraction of a function pipeline.
-Each Stream has a function and a parent (None by default).
-When adding new function to the stream, it creates a new `Stream` object and
-sets "parent" to the previous `Stream` to maintain an order of execution.
+**Returns**:
-Streams supports 3 types of functions:
-- "Apply" - generate new values based on a previous one.
- The result of an Apply function is passed downstream to the next functions.
- If "expand=True" is passed and the function returns an `Iterable`,
- each item of it will be treated as a separate value downstream.
-- "Update" - update values in-place.
- The result of an Update function is always ignored, and its input is passed
- downstream.
-- "Filter" - to filter values from the Stream.
- The result of a Filter function is interpreted as boolean.
- If it's `True`, the input will be passed downstream.
- If it's `False`, the `Filtered` exception will be raised to signal that the
- value is filtered out.
+True if logging config has been updated, otherwise False.
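+
+A minimal usage sketch (the log level string is an assumption):
+
+```python
+from quixstreams.logging import configure_logging
+
+# Returns False (and leaves logging untouched) if handlers are already set up
+configure_logging(loglevel="INFO")
+```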
-To execute the functions on the `Stream`, call `.compose()` method, and
-it will return a closure to execute all the functions accumulated in the Stream
-and its parents.
+
-**Arguments**:
+## quixstreams.error\_callbacks
-- `func`: a function to be called on the stream.
-It is expected to be wrapped into one of "Apply", "Filter" or "Update" from
-`quixstreams.core.stream.functions` package.
-Default - "Apply(lambda v: v)".
-- `parent`: a parent `Stream`
+
-
+## quixstreams.platforms
-#### Stream.add\_filter
+
-```python
-def add_filter(func: Callable[[T], R]) -> Self
-```
+## quixstreams.platforms.quix.config
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L79)
+
-Add a function to filter values from the Stream.
+### TopicCreationConfigs
-The return value of the function will be interpreted as `bool`.
-If the function returns `False`-like result, the Stream will raise `Filtered`
-exception during execution.
+```python
+@dataclasses.dataclass
+class TopicCreationConfigs()
+```
-**Arguments**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L59)
-- `func`: a function to filter values from the stream
+
-**Returns**:
+#### name
-a new `Stream` derived from the current one
+Required when not created by a Quix App.
-
+
-#### Stream.add\_apply
+#### strip\_workspace\_id\_prefix
```python
-def add_apply(func: Callable[[T], R], expand: bool = False) -> Self
+def strip_workspace_id_prefix(workspace_id: str, s: str) -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L92)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L68)
-Add an "apply" function to the Stream.
+Remove the workspace ID from a given string if it starts with it,
-The function is supposed to return a new value, which will be passed
-further during execution.
+typically a topic or consumer group id
**Arguments**:
-- `func`: a function to generate a new value
-- `expand`: if True, expand the returned iterable into individual values
-downstream. If returned value is not iterable, `TypeError` will be raised.
-Default - `False`.
+- `workspace_id`: the workspace id
+- `s`: the string to remove the prefix from
**Returns**:
-a new `Stream` derived from the current one
+the string with workspace_id prefix removed
-
+
-#### Stream.add\_update
+#### prepend\_workspace\_id
```python
-def add_update(func: Callable[[T], object]) -> Self
+def prepend_workspace_id(workspace_id: str, s: str) -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L109)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L80)
-Add an "update" function to the Stream, that will mutate the input value.
+Add the workspace ID as a prefix to a given string if it does not have it,
-The return of this function will be ignored and its input
-will be passed downstream.
+typically a topic or consumer group id
**Arguments**:
-- `func`: a function to mutate the value
+- `workspace_id`: the workspace id
+- `s`: the string to prepend the workspace ID to
**Returns**:
-a new Stream derived from the current one
+the string with workspace_id prepended
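+
+A hedged sketch of the two helpers round-tripping a topic name (the workspace id
+is made up):
+
+```python
+from quixstreams.platforms.quix.config import (
+    prepend_workspace_id,
+    strip_workspace_id_prefix,
+)
+
+prefixed = prepend_workspace_id("my-workspace-id", "input-topic")
+assert strip_workspace_id_prefix("my-workspace-id", prefixed) == "input-topic"
+```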
-
+
-#### Stream.diff
+### QuixKafkaConfigsBuilder
```python
-def diff(other: "Stream") -> Self
+class QuixKafkaConfigsBuilder()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L121)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L92)
-Takes the difference between Streams `self` and `other` based on their last
+Retrieves all the necessary information from the Quix API and builds all the
+objects required to connect a confluent-kafka client to the Quix Platform.
-common parent, and returns a new `Stream` that includes only this difference.
+If not executed within the Quix platform directly, you must provide a Quix
+"streaming" (aka "sdk") token, or Personal Access Token.
-It's impossible to calculate a diff when:
- - Streams don't have a common parent.
- - When the `self` Stream already includes all the nodes from
- the `other` Stream, and the resulting diff is empty.
+Ideally you also know your workspace name or id. If not, you can search for it
+using a known topic name, but note the search space is limited to the access level
+of your token.
-**Arguments**:
+It also currently handles the app_auto_create_topics setting for Application.Quix.
-- `other`: a `Stream` to take a diff from.
+
-**Raises**:
+#### QuixKafkaConfigsBuilder.\_\_init\_\_
-- `ValueError`: if Streams don't have a common parent
-or if the diff is empty.
+```python
+def __init__(quix_sdk_token: Optional[str] = None,
+ workspace_id: Optional[str] = None,
+ workspace_cert_path: Optional[str] = None,
+ quix_portal_api_service: Optional[QuixPortalApiService] = None)
+```
-**Returns**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L108)
-new `Stream` instance including all the Streams from the diff
+**Arguments**:
-
+- `quix_portal_api_service`: A QuixPortalApiService instance (else generated)
+- `workspace_id`: A valid Quix Workspace ID (else searched for)
+- `workspace_cert_path`: path to an existing workspace cert (else retrieved)
-#### Stream.tree
+
+
+#### QuixKafkaConfigsBuilder.strip\_workspace\_id\_prefix
```python
-def tree() -> List[Self]
+def strip_workspace_id_prefix(s: str) -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L150)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L184)
-Return a list of all parent Streams including the node itself.
+Remove the workspace ID from a given string if it starts with it,
-The tree is ordered from child to parent (current node comes first).
+typically a topic or consumer group id
+
+**Arguments**:
+
+- `s`: the string to remove the prefix from
**Returns**:
-a list of `Stream` objects
+the string with workspace_id prefix removed
-
+
-#### Stream.compose
+#### QuixKafkaConfigsBuilder.prepend\_workspace\_id
```python
-def compose(allow_filters: bool = True,
- allow_updates: bool = True,
- allow_expands: bool = True) -> Callable[[T], R]
+def prepend_workspace_id(s: str) -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L164)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L194)
-Compose a list of functions from this `Stream` and its parents into one
-
-big closure using a "composer" function.
+Add the workspace ID as a prefix to a given string if it does not have it,
-Closures are more performant than calling all the functions in the
-`Stream.tree()` one-by-one.
+typically a topic or consumer group id
**Arguments**:
-- `allow_filters`: If False, this function will fail with `ValueError` if
-the stream has filter functions in the tree. Default - True.
-- `allow_updates`: If False, this function will fail with `ValueError` if
-the stream has update functions in the tree. Default - True.
-- `allow_expands`: If False, this function will fail with `ValueError` if
-the stream has functions with "expand=True" in the tree. Default - True.
-
-**Raises**:
-
-- `ValueError`: if disallowed functions are present in the stream tree.
+- `s`: the string to prepend the workspace ID to
-
+**Returns**:
-## quixstreams.core.stream.functions
+the string with workspace_id prepended
-
+
-### StreamFunction
+#### QuixKafkaConfigsBuilder.search\_for\_workspace
```python
-class StreamFunction(abc.ABC)
+def search_for_workspace(
+ workspace_name_or_id: Optional[str] = None) -> Optional[dict]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L26)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L204)
-A base class for all the streaming operations in Quix Streams.
+Search for a workspace given an expected workspace name or id.
-It provides two methods that return closures to be called on the input values:
-- `get_executor` - a wrapper to execute on a single value
-- `get_executor_expanded` - a wrapper to execute on an expanded value.
- Expanded value is a list, where each item should be treated as a separate value.
+**Arguments**:
-
+- `workspace_name_or_id`: the expected name or id of a workspace
-#### StreamFunction.func
+**Returns**:
+
+the workspace data dict if the search succeeds, else None
+
+
+
+#### QuixKafkaConfigsBuilder.get\_workspace\_info
```python
-@property
-def func() -> StreamCallable
+def get_workspace_info(known_workspace_topic: Optional[str] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L43)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L227)
-The original function
+Queries for workspace data from the Quix API, regardless of instance cache,
-
+and updates instance attributes from the query result.
-#### StreamFunction.get\_executor
+**Arguments**:
+
+- `known_workspace_topic`: a topic you know to exist in some workspace
+
+
+
+#### QuixKafkaConfigsBuilder.search\_workspace\_for\_topic
```python
-@abc.abstractmethod
-def get_executor() -> StreamCallable
+def search_workspace_for_topic(workspace_id: str, topic: str) -> Optional[str]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L50)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L254)
-Returns a wrapper to be called on a single value.
+Search through all the topics in the given workspace id to see if there is a
-
+match with the provided topic.
-#### StreamFunction.get\_executor\_expanded
+**Arguments**:
-```python
-@abc.abstractmethod
-def get_executor_expanded() -> StreamCallable
-```
+- `workspace_id`: the workspace to search in
+- `topic`: the topic to search for
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L56)
+**Returns**:
-Returns a wrapper to be called on a list of expanded values.
+the workspace_id if the search succeeds, else None
-
+
-### ApplyFunction
+#### QuixKafkaConfigsBuilder.search\_for\_topic\_workspace
```python
-class ApplyFunction(StreamFunction)
+def search_for_topic_workspace(topic: str) -> Optional[dict]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L62)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L270)
-Wrap a function into "Apply" function.
+Find what workspace a topic belongs to.
-The provided function is expected to return a new value based on input,
-and its result will always be passed downstream.
+If there is only one workspace altogether, it is assumed to be the workspace.
+More than one means each workspace will be searched until the first hit.
-
+**Arguments**:
-### ApplyExpandFunction
+- `topic`: the topic to search for
+
+**Returns**:
+
+workspace data dict if the topic search succeeds, else None
+
+
+
+#### QuixKafkaConfigsBuilder.get\_workspace\_ssl\_cert
```python
-class ApplyExpandFunction(StreamFunction)
+def get_workspace_ssl_cert(
+ extract_to_folder: Optional[Path] = None) -> Optional[str]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L85)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L291)
-Wrap a function into "Apply" function and expand the returned iterable
-into separate values downstream.
+Gets and extracts the zipped certificate from the API to the provided folder if the
-The provided function is expected to return an `Iterable`.
-If the returned value is not `Iterable`, `TypeError` will be raised.
+SSL certificate is specified in broker configuration.
-
+If no path is provided, it will dump to /tmp. Expects a cert named 'ca.cert'.
-### FilterFunction
+**Arguments**:
-```python
-class FilterFunction(StreamFunction)
-```
+- `extract_to_folder`: path to folder to dump zipped cert file to
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L114)
+**Returns**:
-Wraps a function into a "Filter" function.
-The result of a Filter function is interpreted as boolean.
-If it's `True`, the input will be return downstream.
-If it's `False`, the `Filtered` exception will be raised to signal that the
-value is filtered out.
+full cert filepath as string or `None` if certificate is not specified
-
+
-### UpdateFunction
+#### QuixKafkaConfigsBuilder.create\_topics
```python
-class UpdateFunction(StreamFunction)
+def create_topics(topics: List[Topic],
+ finalize_timeout_seconds: Optional[int] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L146)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L375)
-Wrap a function into an "Update" function.
+Create topics in a Quix cluster.
-The provided function is expected to mutate the value
-or to perform some side effect.
-Its result will always be ignored, and its input is passed
-downstream.
+**Arguments**:
-
+- `topics`: a list of `Topic` objects
+- `finalize_timeout_seconds`: How long to wait for the topics to be
+marked as "Ready" (and thus ready to produce to/consume from).
-#### compose
+
+
+#### QuixKafkaConfigsBuilder.get\_topic
```python
-def compose(functions: List[StreamFunction],
- allow_filters: bool = True,
- allow_updates: bool = True,
- allow_expands: bool = True) -> StreamCallable
+def get_topic(topic_name: str) -> Optional[dict]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L175)
-
-Composes a list of functions and its parents into a single
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L421)
-big closure like this:
-```
-[func, func, func] -> func(func(func()))
-```
+Return the topic ID (the actual cluster topic name) if it exists, else None.
-Closures are more performant than calling all functions one by one in a loop.
+>***NOTE***: if the name registered in Quix is instead the workspace-prefixed
+version, this returns None unless that exact name was created WITHOUT the
+Quix API.
**Arguments**:
-- `functions`: list of `StreamFunction` objects to compose
-- `allow_filters`: If False, will fail with `ValueError` if
-the list has `FilterFunction`. Default - True.
-- `allow_updates`: If False, will fail with `ValueError` if
-the list has `UpdateFunction`. Default - True.
-- `allow_expands`: If False, will fail with `ValueError` if
-the list has `ApplyFunction` with "expand=True". Default - True.
+- `topic_name`: name of the topic
-**Raises**:
+**Returns**:
-- `ValueError`: if disallowed functions are present in the list of functions.
+response dict of the topic info if topic found, else None
-
+
-#### composer
+#### QuixKafkaConfigsBuilder.confirm\_topics\_exist
```python
-def composer(outer_func: StreamCallable,
- inner_func: StreamCallable) -> Callable[[T], R]
+def confirm_topics_exist(topics: Union[List[Topic], List[str]])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L227)
-
-A function that wraps two other functions into a closure.
-
-It passes the result of the inner function as an input to the outer function.
-
-**Returns**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L443)
-a function with one argument (value)
+Confirm whether the desired set of topics exists in the Quix workspace.
-
+**Arguments**:
-## quixstreams.dataframe.utils
+- `topics`: a list of `Topic` or topic names
-
+
-#### ensure\_milliseconds
+#### QuixKafkaConfigsBuilder.get\_confluent\_broker\_config
```python
-def ensure_milliseconds(delta: Union[int, timedelta]) -> int
+def get_confluent_broker_config(known_topic: Optional[str] = None) -> dict
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/utils.py#L5)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L483)
-Convert timedelta to milliseconds.
+Get the full client config dictionary required to authenticate a confluent-kafka
-If the `delta` is not
-This function will also round the value to the closest milliseconds in case of
-higher precision.
+client to a Quix platform broker/workspace.
+
+The returned config can be used directly by any confluent-kafka-python consumer/
+producer (add your producer/consumer-specific configs afterward).
**Arguments**:
-- `delta`: `timedelta` object
+- `known_topic`: a topic known to exist in some workspace
**Returns**:
-timedelta value in milliseconds as `int`
-
-
+a dict of confluent-kafka-python client settings (see librdkafka
+config for more details)
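+
+A hedged sketch of wiring the returned config into a plain confluent-kafka
+consumer (the SDK token and names are placeholders):
+
+```python
+from confluent_kafka import Consumer
+
+builder = QuixKafkaConfigsBuilder(quix_sdk_token="<your-streaming-token>")
+config = builder.get_confluent_broker_config()
+
+# For illustration: set a consumer group and subscribe with workspace-prefixed names
+config["group.id"] = builder.prepend_workspace_id("my-consumer-group")
+consumer = Consumer(config)
+consumer.subscribe([builder.prepend_workspace_id("input-topic")])
+```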
-## quixstreams.dataframe.windows
+
-
+#### QuixKafkaConfigsBuilder.get\_confluent\_client\_configs
-## quixstreams.dataframe.windows.base
+```python
+def get_confluent_client_configs(
+ topics: list,
+ consumer_group_id: Optional[str] = None
+) -> Tuple[dict, List[str], Optional[str]]
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L528)
-#### get\_window\_ranges
+Get all the values you need in order to use a confluent_kafka-based client
-```python
-def get_window_ranges(timestamp_ms: int,
- duration_ms: int,
- step_ms: Optional[int] = None) -> List[Tuple[int, int]]
-```
+with a topic on a Quix platform broker/workspace.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/base.py#L22)
+The returned config can be used directly by any confluent-kafka-python consumer/
+producer (add your producer/consumer-specific configs afterward).
-Get a list of window ranges for the given timestamp.
+The topics and consumer group are appended with any necessary values.
**Arguments**:
-- `timestamp_ms`: timestamp in milliseconds
-- `duration_ms`: window duration in milliseconds
-- `step_ms`: window step in milliseconds for hopping windows, optional.
+- `topics`: list of topics
+- `consumer_group_id`: consumer group id, if needed
**Returns**:
-a list of (, ) tuples
+a tuple with configs and altered versions of the topics
+and consumer group name
-
+
-## quixstreams.dataframe.windows.time\_based
+## quixstreams.platforms.quix.env
-
+
-### FixedTimeWindow
+### QuixEnvironment
```python
-class FixedTimeWindow()
+class QuixEnvironment()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/time_based.py#L32)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L7)
-
+Class to access various Quix platform environment settings
-#### FixedTimeWindow.final
+
+
+#### QuixEnvironment.state\_management\_enabled
```python
-def final(expand: bool = True) -> "StreamingDataFrame"
+@property
+def state_management_enabled() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/time_based.py#L95)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L19)
-Apply the window aggregation and return results only when the windows are
+Check whether "State management" is enabled for the current deployment
-closed.
+**Returns**:
+
+True if state management is enabled, otherwise False
+
+
+
+#### QuixEnvironment.deployment\_id
-The format of returned windows:
```python
-{
- "start": ,
- "end": ,
- "value: ,
-}
+@property
+def deployment_id() -> Optional[str]
```
-The individual window is closed when the event time
-(the maximum observed timestamp across the partition) passes
-its end timestamp + grace period.
-The closed windows cannot receive updates anymore and are considered final.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L27)
->***NOTE:*** Windows can be closed only within the same message key.
-If some message keys appear irregularly in the stream, the latest windows
-can remain unprocessed until the message the same key is received.
+Return current Quix deployment id.
-**Arguments**:
+This variable is meant to be set only by Quix Platform and only
+when the application is deployed.
-- `expand`: if `True`, each window result will be sent downstream as
-an individual item. Otherwise, the list of window results will be sent.
-Default - `True`
+**Returns**:
-
+deployment id or None
-#### FixedTimeWindow.current
+
+
+#### QuixEnvironment.workspace\_id
```python
-def current(expand: bool = True) -> "StreamingDataFrame"
+@property
+def workspace_id() -> Optional[str]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/time_based.py#L132)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L39)
-Apply the window transformation to the StreamingDataFrame to return results
+Return Quix workspace id if set
-for each updated window.
+**Returns**:
+
+workspace id or None
+
+
+
+#### QuixEnvironment.portal\_api
-The format of returned windows:
```python
-{
- "start": ,
- "end": ,
- "value: ,
-}
+@property
+def portal_api() -> Optional[str]
```
-This method processes streaming data and returns results as they come,
-regardless of whether the window is closed or not.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L47)
-**Arguments**:
+Return Quix Portal API url if set
-- `expand`: if `True`, each window result will be sent downstream as
-an individual item. Otherwise, the list of window results will be sent.
-Default - `True`
-
-
+**Returns**:
-## quixstreams.dataframe.windows.definitions
+portal API URL or None
-
+
-### FixedTimeWindowDefinition
+#### QuixEnvironment.state\_dir
```python
-class FixedTimeWindowDefinition(abc.ABC)
+@property
+def state_dir() -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L20)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L56)
-
+Return application state directory on Quix.
-#### FixedTimeWindowDefinition.sum
+**Returns**:
-```python
-def sum() -> "FixedTimeWindow"
-```
+path to state dir
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L67)
+
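+Example Snippet (a minimal sketch; it assumes `QuixEnvironment` is instantiated
+directly and that the corresponding environment variables are set by the Quix
+platform when the application is deployed):
+
+```python
+from quixstreams.platforms.quix.env import QuixEnvironment
+
+env = QuixEnvironment()
+
+# These values are only populated on an actual Quix deployment
+if env.deployment_id is not None:
+    print("workspace:", env.workspace_id)
+    print("portal API:", env.portal_api)
+    print("state management enabled:", env.state_management_enabled)
+    print("state dir:", env.state_dir)
+```
+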
-Configure the window to aggregate data by summing up values within
+## quixstreams.platforms.quix.checks
-each window period.
+
-**Returns**:
+#### check\_state\_management\_enabled
-an instance of `FixedTimeWindow` configured to perform sum aggregation.
+```python
+def check_state_management_enabled()
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/checks.py#L11)
-#### FixedTimeWindowDefinition.count
+Check if the State Management feature is enabled for the current deployment on
+the Quix platform.
+If it's disabled, an exception will be raised.
+
+
+
+#### check\_state\_dir
```python
-def count() -> "FixedTimeWindow"
+def check_state_dir(state_dir: str)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L94)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/checks.py#L28)
-Configure the window to aggregate data by counting the number of values
+Check if the Application "state_dir" matches the state dir on the Quix platform.
-within each window period.
+If it doesn't match, a warning will be logged.
-**Returns**:
+**Arguments**:
-an instance of `FixedTimeWindow` configured to perform record count.
+- `state_dir`: application state_dir path
-
+
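+Example Snippet (a minimal sketch of calling both checks; the `"state"` path is
+purely illustrative):
+
+```python
+from quixstreams.platforms.quix.checks import (
+    check_state_management_enabled,
+    check_state_dir,
+)
+
+check_state_management_enabled()    # raises if State Management is disabled
+check_state_dir(state_dir="state")  # logs a warning if it differs from the platform's state dir
+```
+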
-#### FixedTimeWindowDefinition.mean
+## quixstreams.platforms.quix
+
+
+
+## quixstreams.platforms.quix.api
+
+
+
+### QuixPortalApiService
```python
-def mean() -> "FixedTimeWindow"
+class QuixPortalApiService()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L121)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/api.py#L19)
-Configure the window to aggregate data by calculating the mean of the values
+A light wrapper around the Quix Portal API. If used in the Quix Platform, it will
+use that workspace's auth token and portal endpoint; otherwise you must provide them.
-within each window period.
+Function names closely reflect the respective API endpoint,
+each starting with the method [GET, POST, etc.] followed by the endpoint path.
-**Returns**:
+Results will be returned in the form of the request's `Response.json()`, unless
+something else is required. Non-200 responses will raise exceptions.
-an instance of `FixedTimeWindow` configured to calculate the mean
-of the values.
+See the swagger documentation for more info about the endpoints.
-
+
-#### FixedTimeWindowDefinition.reduce
+#### QuixPortalApiService.get\_workspace\_certificate
```python
-def reduce(reducer: Callable[[Any, Any], Any],
- initializer: Callable[[Any], Any]) -> "FixedTimeWindow"
+def get_workspace_certificate(
+ workspace_id: Optional[str] = None) -> Optional[bytes]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L152)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/api.py#L112)
-Configure the window to perform a custom aggregation using `reducer`
+Get a workspace TLS certificate if available.
-and `initializer` functions.
+Returns `None` if certificate is not specified.
-Example Snippet:
-```python
-sdf = StreamingDataFrame(...)
+**Arguments**:
-# Using "reduce()" to calculate multiple aggregates at once
-def reducer(agg: dict, current: int):
- aggregated = {
- 'min': min(agg['min'], current),
- 'max': max(agg['max'], current)
- 'count': agg['count'] + 1
- }
- return aggregated
+- `workspace_id`: workspace id, optional
-def initializer(current) -> dict:
- return {'min': current, 'max': current, 'count': 1}
+**Returns**:
-window = (
- sdf.tumbling_window(duration_ms=1000)
- .reduce(reducer=reducer, initializer=initializer)
- .final()
-)
-```
+certificate as bytes if present, or None
-**Arguments**:
+
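+Example Snippet (a minimal sketch; it assumes `api` is an already-constructed
+`QuixPortalApiService` instance):
+
+```python
+cert = api.get_workspace_certificate()
+if cert is not None:
+    # The certificate is returned as raw bytes
+    with open("ca.cert", "wb") as f:
+        f.write(cert)
+```
+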
-- `reducer`: A function that takes two arguments
-(the accumulated value and a new value) and returns a single value.
-The returned value will be saved to the state store and sent downstream.
-- `initializer`: A function to call for every first element of the window.
-This function is used to initialize the aggregation within a window.
+## quixstreams.platforms.quix.exceptions
-**Returns**:
+
-A window configured to perform custom reduce aggregation on the data.
+## quixstreams.platforms.quix.topic\_manager
-
+
-#### FixedTimeWindowDefinition.max
+### QuixTopicManager
```python
-def max() -> "FixedTimeWindow"
+class QuixTopicManager(TopicManager)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L212)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/topic_manager.py#L9)
-Configure a window to aggregate the maximum value within each window period.
+The source of all topic management with quixstreams.
-**Returns**:
+This is specifically for Applications using the Quix platform.
-an instance of `FixedTimeWindow` configured to calculate the maximum
-value within each window period.
+Generally initialized and managed automatically by an `Application.Quix`,
+but allows a user to work with it directly when needed, such as using it alongside
+a plain `Producer` to create its topics.
-
+See methods for details.
-#### FixedTimeWindowDefinition.min
+
+
+#### QuixTopicManager.\_\_init\_\_
```python
-def min() -> "FixedTimeWindow"
+def __init__(topic_admin: TopicAdmin,
+ quix_config_builder: QuixKafkaConfigsBuilder,
+ create_timeout: int = 60)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L241)
-
-Configure a window to aggregate the minimum value within each window period.
-
-**Returns**:
-
-an instance of `FixedTimeWindow` configured to calculate the maximum
-value within each window period.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/topic_manager.py#L30)
-
+**Arguments**:
-## quixstreams.dataframe
+- `topic_admin`: a `TopicAdmin` instance
+- `create_timeout`: timeout for topic creation
+- `quix_config_builder`: A QuixKafkaConfigsBuilder instance, else one is
+generated for you.
-
+
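+Example Snippet (a minimal sketch; it assumes `admin` and `builder` are
+pre-configured `TopicAdmin` and `QuixKafkaConfigsBuilder` instances created
+elsewhere):
+
+```python
+from quixstreams.platforms.quix.topic_manager import QuixTopicManager
+
+topic_manager = QuixTopicManager(
+    topic_admin=admin,
+    quix_config_builder=builder,
+    create_timeout=60,
+)
+```
+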
-## quixstreams.dataframe.series
+## quixstreams.dataframe.dataframe
-
+
-### StreamingSeries
+### StreamingDataFrame
```python
-class StreamingSeries(BaseStreaming)
+class StreamingDataFrame(BaseStreaming)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L17)
-
-`StreamingSeries` are typically generated by `StreamingDataframes` when getting
-elements from, or performing certain operations on, a `StreamingDataframe`,
-thus acting as a representation of "column" value.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L31)
-They share some operations with the `StreamingDataframe`, but also provide some
-additional functionality.
+`StreamingDataFrame` is the main object you will use for ETL work.
-Most column value operations are handled by this class, and `StreamingSeries` can
-generate other `StreamingSeries` as a result of said operations.
+Typically created with an `app = quixstreams.app.Application()` instance,
+via `sdf = app.dataframe()`.
What it Does:
-- Allows ways to do simple operations with dataframe "column"/dictionary values:
- - Basic ops like add, subtract, modulo, etc.
-- Enables comparisons/inequalities:
- - Greater than, equals, etc.
- - and/or, is/not operations
-- Can check for existence of columns in `StreamingDataFrames`
-- Enables chaining of various operations together
+- Builds a data processing pipeline, declaratively (not executed immediately)
+ - Executes this pipeline on inputs at runtime (Kafka message values)
+- Provides functions/interface similar to Pandas Dataframes/Series
+- Enables stateful processing (and manages everything related to it)
How to Use:
-For the most part, you may not even notice this class exists!
-They will naturally be created as a result of typical `StreamingDataFrame` use.
+Define various operations while continuously reassigning to itself (or new fields).
-Auto-complete should help you with valid methods and type-checking should alert
-you to invalid operations between `StreamingSeries`.
+These operations will generally transform your data, access/update state, or produce
+to kafka topics.
-In general, any typical Pands dataframe operation between columns should be valid
-with `StreamingSeries`, and you shouldn't have to think about them explicitly.
+We recommend that your data structure be "columnar" (i.e. a dict/JSON) in nature so
+that it works with the entire interface, but simple types like `int`, `str`, etc.
+are also supported.
+
+See the various methods and classes for more specifics, or for a deep dive into
+usage, see `streamingdataframe.md` under the `docs/` folder.
+
+>***NOTE:*** column referencing like `sdf["a_column"]` and various methods often
+ create other object types (typically `quixstreams.dataframe.StreamingSeries`),
+ which is expected; type hinting should alert you to any issues should you
+ attempt invalid operations with said objects (however, we cannot infer whether
+ an operation is valid with respect to your data!).
Example Snippet:
```python
-# Random methods for example purposes. More detailed explanations found under
-# various methods or in the docs folder.
-
sdf = StreamingDataframe()
-sdf = sdf["column_a"].apply(a_func).apply(diff_func, stateful=True)
-sdf["my_new_bool_field"] = sdf["column_b"].contains("this_string")
-sdf["new_sum_field"] = sdf["column_c"] + sdf["column_d"] + 2
-sdf = sdf[["column_a"] & (sdf["new_sum_field"] >= 10)]
+sdf = sdf.apply(a_func)
+sdf = sdf.filter(another_func)
+sdf = sdf.to_topic(topic_obj)
```
-
+
-#### StreamingSeries.from\_func
+#### StreamingDataFrame.apply
```python
-@classmethod
-def from_func(cls, func: StreamCallable) -> Self
+def apply(func: Union[DataFrameFunc, DataFrameStatefulFunc],
+ stateful: bool = False,
+ expand: bool = False) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L77)
-
-Create a StreamingSeries from a function.
-
-The provided function will be wrapped into `Apply`
-
-**Arguments**:
-
-- `func`: a function to apply
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L108)
-**Returns**:
+Apply a function to transform the value and return a new value.
-instance of `StreamingSeries`
+The result will be passed downstream as an input value.
-
-#### StreamingSeries.apply
+Example Snippet:
```python
-def apply(func: StreamCallable) -> Self
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L91)
-
-Add a callable to the execution list for this series.
-
-The provided callable should accept a single argument, which will be its input.
-The provided callable should similarly return one output, or None
-
-They can be chained together or included with other operations.
-
-
-Example Snippet:
-
-```python
-# The `StreamingSeries` are generated when `sdf["COLUMN_NAME"]` is called.
-# This stores a string in state and capitalizes the column value; the result is
-# assigned to a new column.
-# Another apply converts a str column to an int, assigning it to a new column.
-
-def func(value: str, state: State):
+# This stores a string in state and capitalizes every column with a string value.
+# A second apply then keeps only the string value columns (shows non-stateful).
+def func(d: dict, state: State):
+ value = d["store_field"]
if value != state.get("my_store_key"):
         state.set("my_store_key", value)
- return v.upper()
+ return {k: v.upper() if isinstance(v, str) else v for k, v in d.items()}
sdf = StreamingDataframe()
-sdf["new_col"] = sdf["a_column"]["nested_dict_key"].apply(func, stateful=True)
-sdf["new_col_2"] = sdf["str_col"].apply(lambda v: int(v)) + sdf["str_col2"] + 2
+sdf = sdf.apply(func, stateful=True)
+sdf = sdf.apply(lambda d: {k: v for k,v in d.items() if isinstance(v, str)})
+
```
**Arguments**:
-- `func`: a callable with one argument and one output
-
-**Returns**:
-
-a new `StreamingSeries` with the new callable added
+- `func`: a function to apply
+- `stateful`: if `True`, the function will be provided with a second argument
+of type `State` to perform stateful operations.
+- `expand`: if True, expand the returned iterable into individual values
+downstream. If returned value is not iterable, `TypeError` will be raised.
+Default - `False`.
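+
+Example Snippet for `expand=True` (a minimal sketch; it assumes each value carries
+a list under an illustrative `"events"` key):
+
+```python
+# Each element of the returned list is sent downstream as an individual value
+sdf = sdf.apply(lambda value: value["events"], expand=True)
+```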
-
+
-#### StreamingSeries.compose
+#### StreamingDataFrame.update
```python
-def compose(allow_filters: bool = True,
- allow_updates: bool = True) -> StreamCallable
+def update(func: Union[DataFrameFunc, DataFrameStatefulFunc],
+ stateful: bool = False) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L125)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L151)
-Compose all functions of this StreamingSeries into one big closure.
+Apply a function to mutate value in-place or to perform a side effect
-Closures are more performant than calling all the functions in the
-`StreamingDataFrame` one-by-one.
+that doesn't update the value (e.g. print a value to the console).
-Generally not required by users; the `quixstreams.app.Application` class will
-do this automatically.
+The result of the function will be ignored, and the original value will be
+passed downstream.
Example Snippet:
```python
-from quixstreams import Application
-
-app = Application(...)
+# Stores a value and mutates a list by appending a new item to it.
+# Also prints to console.
-sdf = app.dataframe()
-sdf = sdf["column_a"].apply(apply_func)
-sdf = sdf["column_b"].contains(filter_func)
-sdf = sdf.compose()
+def func(values: list, state: State):
+ value = values[0]
+ if value != state.get("my_store_key"):
+        state.set("my_store_key", value)
+ values.append("new_item")
-result_0 = sdf({"my": "record"})
-result_1 = sdf({"other": "record"})
+sdf = StreamingDataframe()
+sdf = sdf.update(func, stateful=True)
+sdf = sdf.update(lambda value: print("Received value: ", value))
```
**Arguments**:
-- `allow_filters`: If False, this function will fail with ValueError if
-the stream has filter functions in the tree. Default - True.
-- `allow_updates`: If False, this function will fail with ValueError if
-the stream has update functions in the tree. Default - True.
-
-**Raises**:
-
-- `ValueError`: if disallowed functions are present in the tree of
-underlying `Stream`.
-
-**Returns**:
-
-a function that accepts "value"
-and returns a result of `StreamingSeries`
+- `func`: function to update value
+- `stateful`: if `True`, the function will be provided with a second argument
+of type `State` to perform stateful operations.
-
+
-#### StreamingSeries.test
+#### StreamingDataFrame.filter
```python
-def test(value: Any, ctx: Optional[MessageContext] = None) -> Any
+def filter(func: Union[DataFrameFunc, DataFrameStatefulFunc],
+ stateful: bool = False) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L172)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L190)
-A shorthand to test `StreamingSeries` with provided value
+Filter value using provided function.
-and `MessageContext`.
+If the function returns True-like value, the original value will be
+passed downstream.
+Otherwise, the `Filtered` exception will be raised (further processing for that
+message will be skipped).
-**Arguments**:
-- `value`: value to pass through `StreamingSeries`
-- `ctx`: instance of `MessageContext`, optional.
-Provide it if the StreamingSeries instance has
-functions calling `get_current_key()`.
-Default - `None`.
+Example Snippet:
-**Returns**:
+```python
+# Stores a value and allows further processing only if the value is greater than
+# what was previously stored.
-result of `StreamingSeries`
+def func(d: dict, state: State):
+ value = d["my_value"]
+    if value > state.get("my_store_key", 0):
+        state.set("my_store_key", value)
+ return True
+ return False
-
+sdf = StreamingDataframe()
+sdf = sdf.filter(func, stateful=True)
+```
-#### StreamingSeries.isin
+**Arguments**:
-```python
-def isin(other: Container) -> Self
-```
+- `func`: function to filter value
+- `stateful`: if `True`, the function will be provided with a second argument
+of type `State` to perform stateful operations.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L208)
+
-Check if series value is in "other".
+#### StreamingDataFrame.contains
-Same as "StreamingSeries in other".
+```python
+@staticmethod
+def contains(key: str) -> StreamingSeries
+```
-Runtime result will be a `bool`.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L233)
+Check if the key is present in the Row value.
Example Snippet:
```python
-from quixstreams import Application
-
-# Check if "str_column" is contained in a column with a list of strings and
-# assign the resulting `bool` to a new column: "has_my_str".
+# Add new column 'has_column' which contains a boolean indicating
+# the presence of 'column_x'
-sdf = app.dataframe()
-sdf["has_my_str"] = sdf["str_column"].isin(sdf["column_with_list_of_strs"])
+sdf = StreamingDataframe()
+sdf['has_column'] = sdf.contains('column_x')
```
**Arguments**:
-- `other`: a container to check
+- `key`: a column name to check.
**Returns**:
-new StreamingSeries
+a Column object that evaluates to True if the key is present
+or False otherwise.
-
+
-#### StreamingSeries.contains
+#### StreamingDataFrame.to\_topic
```python
-def contains(other: Union[Self, object]) -> Self
+def to_topic(topic: Topic,
+ key: Optional[Callable[[object], object]] = None) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L235)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L256)
-Check if series value contains "other"
-
-Same as "other in StreamingSeries".
+Produce current value to a topic. You can optionally specify a new key.
-Runtime result will be a `bool`.
+>***NOTE:*** A `RowProducer` instance must be assigned to
+`StreamingDataFrame.producer` if not using :class:`quixstreams.app.Application`
+ to facilitate the execution of StreamingDataFrame.
Example Snippet:
@@ -969,421 +897,471 @@ Example Snippet:
```python
from quixstreams import Application
-# Check if "column_a" contains "my_substring" and assign the resulting
-# `bool` to a new column: "has_my_substr"
+# Produce to two different topics, changing the key for one of them.
-sdf = app.dataframe()
-sdf["has_my_substr"] = sdf["column_a"].contains("my_substring")
+app = Application()
+input_topic = app.topic("input_x")
+output_topic_0 = app.topic("output_a")
+output_topic_1 = app.topic("output_b")
+
+sdf = app.dataframe(input_topic)
+sdf = sdf.to_topic(output_topic_0)
+sdf = sdf.to_topic(output_topic_1, key=lambda data: data["a_field"])
```
**Arguments**:
-- `other`: object to check
-
-**Returns**:
-
-new StreamingSeries
+- `topic`: instance of `Topic`
+- `key`: a callable to generate a new message key, optional.
+If passed, the return type of this callable must be serializable
+by `key_serializer` defined for this Topic object.
+By default, the current message key will be used.
-
+
-#### StreamingSeries.is\_
+#### StreamingDataFrame.compose
```python
-def is_(other: Union[Self, object]) -> Self
+def compose() -> StreamCallable
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L260)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L295)
-Check if series value refers to the same object as `other`
+Compose all functions of this StreamingDataFrame into one big closure.
-Runtime result will be a `bool`.
+Closures are more performant than calling all the functions in the
+`StreamingDataFrame` one-by-one.
+
+Generally not required by users; the `quixstreams.app.Application` class will
+do this automatically.
Example Snippet:
```python
-# Check if "column_a" is the same as "column_b" and assign the resulting `bool`
-# to a new column: "is_same"
-
from quixstreams import Application
sdf = app.dataframe()
-sdf["is_same"] = sdf["column_a"].is_(sdf["column_b"])
-```
-
-**Arguments**:
+sdf = sdf.apply(apply_func)
+sdf = sdf.filter(filter_func)
+sdf = sdf.compose()
-- `other`: object to check for "is"
+result_0 = sdf({"my": "record"})
+result_1 = sdf({"other": "record"})
+```
**Returns**:
-new StreamingSeries
+a function that accepts "value"
+and returns a result of StreamingDataFrame
-
+
-#### StreamingSeries.isnot
+#### StreamingDataFrame.test
```python
-def isnot(other: Union[Self, object]) -> Self
+def test(value: object, ctx: Optional[MessageContext] = None) -> Any
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L283)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L325)
-Check if series value does not refer to the same object as `other`
+A shorthand to test `StreamingDataFrame` with provided value
-Runtime result will be a `bool`.
+and `MessageContext`.
+**Arguments**:
-Example Snippet:
+- `value`: value to pass through `StreamingDataFrame`
+- `ctx`: instance of `MessageContext`, optional.
+Provide it if the StreamingDataFrame instance calls `to_topic()`,
+has stateful functions or functions calling `get_current_key()`.
+Default - `None`.
-```python
-from quixstreams import Application
+**Returns**:
-# Check if "column_a" is the same as "column_b" and assign the resulting `bool`
-# to a new column: "is_not_same"
+result of `StreamingDataFrame`
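+
+Example Snippet (a minimal sketch with a stateless pipeline, so no `MessageContext`
+is needed):
+
+```python
+sdf = app.dataframe()
+sdf = sdf.apply(lambda value: {**value, "doubled": value["number"] * 2})
+
+assert sdf.test({"number": 2}) == {"number": 2, "doubled": 4}
+```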
-sdf = app.dataframe()
-sdf["is_not_same"] = sdf["column_a"].isnot(sdf["column_b"])
-```
-
-**Arguments**:
-
-- `other`: object to check for "is_not"
-
-**Returns**:
-
-new StreamingSeries
-
-
+
-#### StreamingSeries.isnull
+#### StreamingDataFrame.tumbling\_window
```python
-def isnull() -> Self
+def tumbling_window(duration_ms: Union[int, timedelta],
+ grace_ms: Union[int, timedelta] = 0,
+ name: Optional[str] = None) -> TumblingWindowDefinition
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L307)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L343)
-Check if series value is None.
+Create a tumbling window transformation on this StreamingDataFrame.
-Runtime result will be a `bool`.
+Tumbling windows divide time into fixed-sized, non-overlapping windows.
+They allow you to perform stateful aggregations like `sum`, `reduce`, etc.
+on top of the data and emit results downstream.
-Example Snippet:
+Notes:
-```python
-from quixstreams import Application
+- Every window is grouped by the current Kafka message key.
+- Messages with `None` key will be ignored.
+- The time windows always use the current event time.
-# Check if "column_a" is null and assign the resulting `bool` to a new column:
-# "is_null"
-sdf = app.dataframe()
-sdf["is_null"] = sdf["column_a"].isnull()
-```
-**Returns**:
+Example Snippet:
-new StreamingSeries
+```python
+app = Application()
+sdf = app.dataframe(...)
-
+sdf = (
+ # Define a tumbling window of 60s and grace period of 10s
+ sdf.tumbling_window(
+ duration_ms=timedelta(seconds=60), grace_ms=timedelta(seconds=10.0)
+ )
-#### StreamingSeries.notnull
+ # Specify the aggregation function
+ .sum()
-```python
-def notnull() -> Self
+ # Specify how the results should be emitted downstream.
+    # "current()" will emit results as they come for each updated window,
+ # possibly producing multiple messages per key-window pair
+ # "final()" will emit windows only when they are closed and cannot
+ # receive any updates anymore.
+    .current()
+)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L330)
-
-Check if series value is not None.
+**Arguments**:
-Runtime result will be a `bool`.
+- `duration_ms`: The length of each window.
+Can be specified as either an `int` representing milliseconds or a
+`timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `grace_ms`: The grace period for data arrival.
+It allows late-arriving data (data arriving after the window
+has theoretically closed) to be included in the window.
+Can be specified as either an `int` representing milliseconds
+or as a `timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `name`: The unique identifier for the window. If not provided, it will be
+automatically generated based on the window's properties.
+**Returns**:
-Example Snippet:
+`TumblingWindowDefinition` instance representing the tumbling window
+configuration.
+This object can be further configured with aggregation functions
+like `sum`, `count`, etc., and applied to the StreamingDataFrame.
-```python
-from quixstreams import Application
+
-# Check if "column_a" is not null and assign the resulting `bool` to a new column:
-# "is_not_null"
+#### StreamingDataFrame.hopping\_window
-sdf = app.dataframe()
-sdf["is_not_null"] = sdf["column_a"].notnull()
+```python
+def hopping_window(duration_ms: Union[int, timedelta],
+ step_ms: Union[int, timedelta],
+ grace_ms: Union[int, timedelta] = 0,
+ name: Optional[str] = None) -> HoppingWindowDefinition
```
-**Returns**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L418)
-new StreamingSeries
+Create a hopping window transformation on this StreamingDataFrame.
-
+Hopping windows divide the data stream into overlapping windows based on time.
+The overlap is controlled by the `step_ms` parameter.
-#### StreamingSeries.abs
+They allow you to perform stateful aggregations like `sum`, `reduce`, etc.
+on top of the data and emit results downstream.
-```python
-def abs() -> Self
-```
+Notes:
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L353)
+- Every window is grouped by the current Kafka message key.
+- Messages with `None` key will be ignored.
+- The time windows always use the current event time.
-Get absolute value of the series value.
Example Snippet:
```python
-from quixstreams import Application
-
-# Get absolute value of "int_col" and add it to "other_int_col".
-# Finally, assign the result to a new column: "abs_col_sum".
+app = Application()
+sdf = app.dataframe(...)
-sdf = app.dataframe()
-sdf["abs_col_sum"] = sdf["int_col"].abs() + sdf["other_int_col"]
-```
+sdf = (
+ # Define a hopping window of 60s with step 30s and grace period of 10s
+ sdf.hopping_window(
+ duration_ms=timedelta(seconds=60),
+ step_ms=timedelta(seconds=30),
+ grace_ms=timedelta(seconds=10)
+ )
-**Returns**:
+ # Specify the aggregation function
+ .sum()
-new StreamingSeries
+ # Specify how the results should be emitted downstream.
+    # "current()" will emit results as they come for each updated window,
+ # possibly producing multiple messages per key-window pair
+ # "final()" will emit windows only when they are closed and cannot
+ # receive any updates anymore.
+    .current()
+)
+```
-
+**Arguments**:
-## quixstreams.dataframe.base
+- `duration_ms`: The length of each window. It defines the time span for
+which each window aggregates data.
+Can be specified as either an `int` representing milliseconds
+or a `timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `step_ms`: The step size for the window.
+It determines how much each successive window moves forward in time.
+Can be specified as either an `int` representing milliseconds
+or a `timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `grace_ms`: The grace period for data arrival.
+It allows late-arriving data to be included in the window,
+even if it arrives after the window has theoretically moved forward.
+Can be specified as either an `int` representing milliseconds
+or a `timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `name`: The unique identifier for the window. If not provided, it will be
+automatically generated based on the window's properties.
-
+**Returns**:
-## quixstreams.dataframe.exceptions
+`HoppingWindowDefinition` instance representing the hopping
+window configuration.
+This object can be further configured with aggregation functions
+like `sum`, `count`, etc. and applied to the StreamingDataFrame.
-
+
-## quixstreams.dataframe.dataframe
+## quixstreams.dataframe.series
-
+
-### StreamingDataFrame
+### StreamingSeries
```python
-class StreamingDataFrame(BaseStreaming)
+class StreamingSeries(BaseStreaming)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L32)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L40)
-`StreamingDataFrame` is the main object you will use for ETL work.
+`StreamingSeries` are typically generated by `StreamingDataframes` when getting
+elements from, or performing certain operations on, a `StreamingDataframe`,
+thus acting as a representation of "column" value.
-Typically created with an `app = quixstreams.app.Application()` instance,
-via `sdf = app.dataframe()`.
+They share some operations with the `StreamingDataframe`, but also provide some
+additional functionality.
+
+Most column value operations are handled by this class, and `StreamingSeries` can
+generate other `StreamingSeries` as a result of said operations.
What it Does:
-- Builds a data processing pipeline, declaratively (not executed immediately)
- - Executes this pipeline on inputs at runtime (Kafka message values)
-- Provides functions/interface similar to Pandas Dataframes/Series
-- Enables stateful processing (and manages everything related to it)
+- Provides ways to do simple operations with dataframe "column"/dictionary values:
+ - Basic ops like add, subtract, modulo, etc.
+- Enables comparisons/inequalities:
+ - Greater than, equals, etc.
+ - and/or, is/not operations
+- Can check for existence of columns in `StreamingDataFrames`
+- Enables chaining of various operations together
How to Use:
-Define various operations while continuously reassigning to itself (or new fields).
-
-These operations will generally transform your data, access/update state, or produce
-to kafka topics.
-
-We recommend your data structure to be "columnar" (aka a dict/JSON) in nature so
-that it works with the entire interface, but simple types like `ints`, `str`, etc.
-are also supported.
+For the most part, you may not even notice this class exists!
+They will naturally be created as a result of typical `StreamingDataFrame` use.
-See the various methods and classes for more specifics, or for a deep dive into
-usage, see `streamingdataframe.md` under the `docs/` folder.
+Auto-complete should help you with valid methods and type-checking should alert
+you to invalid operations between `StreamingSeries`.
->***NOTE:*** column referencing like `sdf["a_column"]` and various methods often
- create other object types (typically `quixstreams.dataframe.StreamingSeries`),
- which is expected; type hinting should alert you to any issues should you
- attempt invalid operations with said objects (however, we cannot infer whether
- an operation is valid with respect to your data!).
+In general, any typical Pandas dataframe operation between columns should be valid
+with `StreamingSeries`, and you shouldn't have to think about them explicitly.
Example Snippet:
```python
+# Random methods for example purposes. More detailed explanations found under
+# various methods or in the docs folder.
+
sdf = StreamingDataframe()
-sdf = sdf.apply(a_func)
-sdf = sdf.filter(another_func)
-sdf = sdf.to_topic(topic_obj)
+sdf = sdf["column_a"].apply(a_func).apply(diff_func, stateful=True)
+sdf["my_new_bool_field"] = sdf["column_b"].contains("this_string")
+sdf["new_sum_field"] = sdf["column_c"] + sdf["column_d"] + 2
+sdf = sdf[sdf["column_a"] & (sdf["new_sum_field"] >= 10)]
```
-
+
-#### StreamingDataFrame.apply
+#### StreamingSeries.from\_func
```python
-def apply(func: Union[DataFrameFunc, DataFrameStatefulFunc],
- stateful: bool = False,
- expand: bool = False) -> Self
+@classmethod
+def from_func(cls, func: StreamCallable) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L109)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L100)
-Apply a function to transform the value and return a new value.
+Create a StreamingSeries from a function.
-The result will be passed downstream as an input value.
+The provided function will be wrapped into `Apply`
+**Arguments**:
-Example Snippet:
+- `func`: a function to apply
-```python
-# This stores a string in state and capitalizes every column with a string value.
-# A second apply then keeps only the string value columns (shows non-stateful).
-def func(d: dict, state: State):
- value = d["store_field"]
- if value != state.get("my_store_key"):
- state.set("my_store_key") = value
- return {k: v.upper() if isinstance(v, str) else v for k, v in d.items()}
-
-sdf = StreamingDataframe()
-sdf = sdf.apply(func, stateful=True)
-sdf = sdf.apply(lambda d: {k: v for k,v in d.items() if isinstance(v, str)})
-
-```
-
-**Arguments**:
+**Returns**:
-- `func`: a function to apply
-- `stateful`: if `True`, the function will be provided with a second argument
-of type `State` to perform stateful operations.
-- `expand`: if True, expand the returned iterable into individual values
-downstream. If returned value is not iterable, `TypeError` will be raised.
-Default - `False`.
+instance of `StreamingSeries`
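+
+Example Snippet (a minimal sketch; in typical usage a series is created via
+`sdf["column_name"]` instead):
+
+```python
+from quixstreams.dataframe.series import StreamingSeries
+
+series = StreamingSeries.from_func(lambda value: value["field_a"])
+```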
-
+
-#### StreamingDataFrame.update
+#### StreamingSeries.apply
```python
-def update(func: Union[DataFrameFunc, DataFrameStatefulFunc],
- stateful: bool = False) -> Self
+def apply(func: StreamCallable) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L152)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L114)
-Apply a function to mutate value in-place or to perform a side effect
+Add a callable to the execution list for this series.
-that doesn't update the value (e.g. print a value to the console).
+The provided callable should accept a single argument, which will be its input.
+The provided callable should similarly return one output, or None
-The result of the function will be ignored, and the original value will be
-passed downstream.
+They can be chained together or included with other operations.
Example Snippet:
```python
-# Stores a value and mutates a list by appending a new item to it.
-# Also prints to console.
+# The `StreamingSeries` are generated when `sdf["COLUMN_NAME"]` is called.
+# This stores a string in state and capitalizes the column value; the result is
+# assigned to a new column.
+# Another apply converts a str column to an int, assigning it to a new column.
-def func(values: list, state: State):
- value = values[0]
+def func(value: str, state: State):
if value != state.get("my_store_key"):
         state.set("my_store_key", value)
- values.append("new_item")
+        return value.upper()
sdf = StreamingDataframe()
-sdf = sdf.update(func, stateful=True)
-sdf = sdf.update(lambda value: print("Received value: ", value))
+sdf["new_col"] = sdf["a_column"]["nested_dict_key"].apply(func, stateful=True)
+sdf["new_col_2"] = sdf["str_col"].apply(lambda v: int(v)) + sdf["str_col2"] + 2
```
**Arguments**:
-- `func`: function to update value
-- `stateful`: if `True`, the function will be provided with a second argument
-of type `State` to perform stateful operations.
+- `func`: a callable with one argument and one output
-
+**Returns**:
-#### StreamingDataFrame.filter
+a new `StreamingSeries` with the new callable added
+
+
+
+#### StreamingSeries.compose
```python
-def filter(func: Union[DataFrameFunc, DataFrameStatefulFunc],
- stateful: bool = False) -> Self
+def compose(allow_filters: bool = True,
+ allow_updates: bool = True) -> StreamCallable
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L191)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L148)
-Filter value using provided function.
+Compose all functions of this StreamingSeries into one big closure.
-If the function returns True-like value, the original value will be
-passed downstream.
-Otherwise, the `Filtered` exception will be raised (further processing for that
-message will be skipped).
+Closures are more performant than calling all the functions in the
+`StreamingDataFrame` one-by-one.
+
+Generally not required by users; the `quixstreams.app.Application` class will
+do this automatically.
Example Snippet:
```python
-# Stores a value and allows further processing only if the value is greater than
-# what was previously stored.
+from quixstreams import Application
-def func(d: dict, state: State):
- value = d["my_value"]
- if value > state.get("my_store_key"):
- state.set("my_store_key") = value
- return True
- return False
+app = Application(...)
-sdf = StreamingDataframe()
-sdf = sdf.filter(func, stateful=True)
+sdf = app.dataframe()
+sdf = sdf["column_a"].apply(apply_func)
+sdf = sdf["column_b"].contains(filter_func)
+sdf = sdf.compose()
+
+result_0 = sdf({"my": "record"})
+result_1 = sdf({"other": "record"})
```
**Arguments**:
-- `func`: function to filter value
-- `stateful`: if `True`, the function will be provided with second argument
-of type `State` to perform stateful operations.
+- `allow_filters`: If False, this function will fail with ValueError if
+the stream has filter functions in the tree. Default - True.
+- `allow_updates`: If False, this function will fail with ValueError if
+the stream has update functions in the tree. Default - True.
-
+**Raises**:
-#### StreamingDataFrame.contains
+- `ValueError`: if disallowed functions are present in the tree of
+underlying `Stream`.
-```python
-@staticmethod
-def contains(key: str) -> StreamingSeries
-```
+**Returns**:
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L244)
+a function that accepts "value"
+and returns a result of `StreamingSeries`
-Check if the key is present in the Row value.
+
-Example Snippet:
+#### StreamingSeries.test
```python
-# Add new column 'has_column' which contains a boolean indicating
-# the presence of 'column_x'
-
-sdf = StreamingDataframe()
-sdf['has_column'] = sdf.contains('column_x')
+def test(value: Any, ctx: Optional[MessageContext] = None) -> Any
```
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L195)
+
+A shorthand to test `StreamingSeries` with provided value
+
+and `MessageContext`.
+
**Arguments**:
-- `key`: a column name to check.
+- `value`: value to pass through `StreamingSeries`
+- `ctx`: instance of `MessageContext`, optional.
+Provide it if the StreamingSeries instance has
+functions calling `get_current_key()`.
+Default - `None`.
**Returns**:
-a Column object that evaluates to True if the key is present
-or False otherwise.
+result of `StreamingSeries`
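+
+Example Snippet (a minimal sketch; the series here has no stateful functions, so no
+`MessageContext` is needed):
+
+```python
+sdf = app.dataframe()
+series = sdf["temperature"] > 30
+
+assert series.test({"temperature": 35})
+```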
-
+
-#### StreamingDataFrame.to\_topic
+#### StreamingSeries.isin
```python
-def to_topic(topic: Topic,
- key: Optional[Callable[[object], object]] = None) -> Self
+def isin(other: Container) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L267)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L231)
-Produce current value to a topic. You can optionally specify a new key.
+Check if series value is in "other".
->***NOTE:*** A `RowProducer` instance must be assigned to
-`StreamingDataFrame.producer` if not using :class:`quixstreams.app.Application`
- to facilitate the execution of StreamingDataFrame.
+Same as "StreamingSeries in other".
+
+Runtime result will be a `bool`.
Example Snippet:
@@ -1391,1151 +1369,1302 @@ Example Snippet:
```python
from quixstreams import Application
-# Produce to two different topics, changing the key for one of them.
-
-app = Application()
-input_topic = app.topic("input_x")
-output_topic_0 = app.topic("output_a")
-output_topic_1 = app.topic("output_b")
+# Check if "str_column" is contained in a column with a list of strings and
+# assign the resulting `bool` to a new column: "has_my_str".
-sdf = app.dataframe(input_topic)
-sdf = sdf.to_topic(output_topic_0)
-sdf = sdf.to_topic(output_topic_1, key=lambda data: data["a_field"])
+sdf = app.dataframe()
+sdf["has_my_str"] = sdf["str_column"].isin(sdf["column_with_list_of_strs"])
```
**Arguments**:
-- `topic`: instance of `Topic`
-- `key`: a callable to generate a new message key, optional.
-If passed, the return type of this callable must be serializable
-by `key_serializer` defined for this Topic object.
-By default, the current message key will be used.
+- `other`: a container to check
-
+**Returns**:
-#### StreamingDataFrame.compose
+new StreamingSeries
+
+
+
+#### StreamingSeries.contains
```python
-def compose() -> StreamCallable
+def contains(other: Union[Self, object]) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L306)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L258)
-Compose all functions of this StreamingDataFrame into one big closure.
+Check if series value contains "other"
-Closures are more performant than calling all the functions in the
-`StreamingDataFrame` one-by-one.
+Same as "other in StreamingSeries".
-Generally not required by users; the `quixstreams.app.Application` class will
-do this automatically.
+Runtime result will be a `bool`.
Example Snippet:
```python
from quixstreams import Application
-sdf = app.dataframe()
-sdf = sdf.apply(apply_func)
-sdf = sdf.filter(filter_func)
-sdf = sdf.compose()
-result_0 = sdf({"my": "record"})
-result_1 = sdf({"other": "record"})
+# Check if "column_a" contains "my_substring" and assign the resulting
+# `bool` to a new column: "has_my_substr"
+
+sdf = app.dataframe()
+sdf["has_my_substr"] = sdf["column_a"].contains("my_substring")
```
+**Arguments**:
+
+- `other`: object to check
+
**Returns**:
-a function that accepts "value"
-and returns a result of StreamingDataFrame
+new StreamingSeries
-
+
-#### StreamingDataFrame.test
+#### StreamingSeries.is\_
```python
-def test(value: object, ctx: Optional[MessageContext] = None) -> Any
+def is_(other: Union[Self, object]) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L336)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L283)
-A shorthand to test `StreamingDataFrame` with provided value
+Check if series value refers to the same object as `other`
-and `MessageContext`.
+Runtime result will be a `bool`.
+
+
+Example Snippet:
+
+```python
+# Check if "column_a" is the same as "column_b" and assign the resulting `bool`
+# to a new column: "is_same"
+
+from quixstreams import Application
+sdf = app.dataframe()
+sdf["is_same"] = sdf["column_a"].is_(sdf["column_b"])
+```
**Arguments**:
-- `value`: value to pass through `StreamingDataFrame`
-- `ctx`: instance of `MessageContext`, optional.
-Provide it if the StreamingDataFrame instance calls `to_topic()`,
-has stateful functions or functions calling `get_current_key()`.
-Default - `None`.
+- `other`: object to check for "is"
**Returns**:
-result of `StreamingDataFrame`
+new StreamingSeries
-
+
-#### StreamingDataFrame.tumbling\_window
+#### StreamingSeries.isnot
```python
-def tumbling_window(duration_ms: Union[int, timedelta],
- grace_ms: Union[int, timedelta] = 0,
- name: Optional[str] = None) -> TumblingWindowDefinition
+def isnot(other: Union[Self, object]) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L354)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L306)
-Create a tumbling window transformation on this StreamingDataFrame.
+Check if series value does not refer to the same object as `other`
-Tumbling windows divide time into fixed-sized, non-overlapping windows.
+Runtime result will be a `bool`.
-They allow to perform stateful aggregations like `sum`, `reduce`, etc.
-on top of the data and emit results downstream.
-Notes:
+Example Snippet:
-- Every window is grouped by the current Kafka message key.
-- Messages with `None` key will be ignored.
-- The time windows always use the current event time.
+```python
+from quixstreams import Application
+
+# Check if "column_a" is the same as "column_b" and assign the resulting `bool`
+# to a new column: "is_not_same"
+
+sdf = app.dataframe()
+sdf["is_not_same"] = sdf["column_a"].isnot(sdf["column_b"])
+```
+
+**Arguments**:
+
+- `other`: object to check for "is_not"
+
+**Returns**:
+
+new StreamingSeries
+
+
+
+#### StreamingSeries.isnull
+
+```python
+def isnull() -> Self
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L330)
+
+Check if series value is None.
+Runtime result will be a `bool`.
Example Snippet:
```python
-app = Application()
-sdf = app.dataframe(...)
+from quixstreams import Application
-sdf = (
- # Define a tumbling window of 60s and grace period of 10s
- sdf.tumbling_window(
- duration_ms=timedelta(seconds=60), grace_ms=timedelta(seconds=10.0)
- )
+# Check if "column_a" is null and assign the resulting `bool` to a new column:
+# "is_null"
- # Specify the aggregation function
- .sum()
+sdf = app.dataframe()
+sdf["is_null"] = sdf["column_a"].isnull()
+```
- # Specify how the results should be emitted downstream.
- # "all()" will emit results as they come for each updated window,
- # possibly producing multiple messages per key-window pair
- # "final()" will emit windows only when they are closed and cannot
- # receive any updates anymore.
- .all()
-)
+**Returns**:
+
+new StreamingSeries
+
+
+
+#### StreamingSeries.notnull
+
+```python
+def notnull() -> Self
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L353)
+
+Check if series value is not None.
+
+Runtime result will be a `bool`.
+
+
+Example Snippet:
+
+```python
+from quixstreams import Application
+
+# Check if "column_a" is not null and assign the resulting `bool` to a new column:
+# "is_not_null"
+
+sdf = app.dataframe()
+sdf["is_not_null"] = sdf["column_a"].notnull()
+```
+
+**Returns**:
+
+new StreamingSeries
+
+
+
+#### StreamingSeries.abs
+
+```python
+def abs() -> Self
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L376)
+
+Get absolute value of the series value.
+
+Example Snippet:
+
+```python
+from quixstreams import Application
+
+# Get absolute value of "int_col" and add it to "other_int_col".
+# Finally, assign the result to a new column: "abs_col_sum".
+
+sdf = app.dataframe()
+sdf["abs_col_sum"] = sdf["int_col"].abs() + sdf["other_int_col"]
+```
+
+**Returns**:
+
+new StreamingSeries
+
+
+
+## quixstreams.dataframe
+
+
+
+## quixstreams.dataframe.utils
+
+
+
+#### ensure\_milliseconds
+
+```python
+def ensure_milliseconds(delta: Union[int, timedelta]) -> int
```
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/utils.py#L5)
+
+Convert timedelta to milliseconds.
+
+If `delta` is already an `int`, it is assumed to be expressed in milliseconds.
+This function will also round the value to the closest millisecond in case of
+higher precision.
+
**Arguments**:
-- `duration_ms`: The length of each window.
-Can be specified as either an `int` representing milliseconds or a
-`timedelta` object.
->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
-value.
-- `grace_ms`: The grace period for data arrival.
-It allows late-arriving data (data arriving after the window
-has theoretically closed) to be included in the window.
-Can be specified as either an `int` representing milliseconds
-or as a `timedelta` object.
->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
-value.
-- `name`: The unique identifier for the window. If not provided, it will be
-automatically generated based on the window's properties.
+- `delta`: `timedelta` object
**Returns**:
-`TumblingWindowDefinition` instance representing the tumbling window
-configuration.
-This object can be further configured with aggregation functions
-like `sum`, `count`, etc. applied to the StreamingDataFrame.
+timedelta value in milliseconds as `int`
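+
+Example Snippet (a minimal sketch; plain `int` inputs are assumed to already be
+expressed in milliseconds, per the signature above):
+
+```python
+from datetime import timedelta
+
+from quixstreams.dataframe.utils import ensure_milliseconds
+
+print(ensure_milliseconds(timedelta(seconds=1.5)))  # 1500
+print(ensure_milliseconds(1500))                    # 1500, already in milliseconds
+```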
-
+
-#### StreamingDataFrame.hopping\_window
+## quixstreams.dataframe.exceptions
+
+
+
+## quixstreams.dataframe.windows.definitions
+
+
+
+### FixedTimeWindowDefinition
```python
-def hopping_window(duration_ms: Union[int, timedelta],
- step_ms: Union[int, timedelta],
- grace_ms: Union[int, timedelta] = 0,
- name: Optional[str] = None) -> HoppingWindowDefinition
+class FixedTimeWindowDefinition(abc.ABC)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L429)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L20)
-Create a hopping window transformation on this StreamingDataFrame.
+
-Hopping windows divide the data stream into overlapping windows based on time.
-The overlap is controlled by the `step_ms` parameter.
+#### FixedTimeWindowDefinition.sum
-They allow to perform stateful aggregations like `sum`, `reduce`, etc.
-on top of the data and emit results downstream.
+```python
+def sum() -> "FixedTimeWindow"
+```
-Notes:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L67)
-- Every window is grouped by the current Kafka message key.
-- Messages with `None` key will be ignored.
-- The time windows always use the current event time.
+Configure the window to aggregate data by summing up values within
+each window period.
-Example Snippet:
+**Returns**:
+
+an instance of `FixedTimeWindow` configured to perform sum aggregation.
+
+
+
+#### FixedTimeWindowDefinition.count
```python
-app = Application()
-sdf = app.dataframe(...)
+def count() -> "FixedTimeWindow"
+```
-sdf = (
- # Define a hopping window of 60s with step 30s and grace period of 10s
- sdf.hopping_window(
- duration_ms=timedelta(seconds=60),
- step_ms=timedelta(seconds=30),
- grace_ms=timedelta(seconds=10)
- )
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L94)
- # Specify the aggregation function
- .sum()
+Configure the window to aggregate data by counting the number of values
- # Specify how the results should be emitted downstream.
- # "all()" will emit results as they come for each updated window,
- # possibly producing multiple messages per key-window pair
- # "final()" will emit windows only when they are closed and cannot
- # receive any updates anymore.
- .all()
+within each window period.
+
+**Returns**:
+
+an instance of `FixedTimeWindow` configured to perform record count.
+
+
+
+#### FixedTimeWindowDefinition.mean
+
+```python
+def mean() -> "FixedTimeWindow"
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L121)
+
+Configure the window to aggregate data by calculating the mean of the values
+
+within each window period.
+
+**Returns**:
+
+an instance of `FixedTimeWindow` configured to calculate the mean
+of the values.
+
+
+
+#### FixedTimeWindowDefinition.reduce
+
+```python
+def reduce(reducer: Callable[[Any, Any], Any],
+ initializer: Callable[[Any], Any]) -> "FixedTimeWindow"
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L152)
+
+Configure the window to perform a custom aggregation using `reducer`
+
+and `initializer` functions.
+
+Example Snippet:
+```python
+sdf = StreamingDataFrame(...)
+
+# Using "reduce()" to calculate multiple aggregates at once
+def reducer(agg: dict, current: int):
+ aggregated = {
+ 'min': min(agg['min'], current),
+        'max': max(agg['max'], current),
+ 'count': agg['count'] + 1
+ }
+ return aggregated
+
+def initializer(current) -> dict:
+ return {'min': current, 'max': current, 'count': 1}
+
+window = (
+ sdf.tumbling_window(duration_ms=1000)
+ .reduce(reducer=reducer, initializer=initializer)
+ .final()
)
```
**Arguments**:
-- `duration_ms`: The length of each window. It defines the time span for
-which each window aggregates data.
-Can be specified as either an `int` representing milliseconds
-or a `timedelta` object.
->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
-value.
-- `step_ms`: The step size for the window.
-It determines how much each successive window moves forward in time.
-Can be specified as either an `int` representing milliseconds
-or a `timedelta` object.
->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
-value.
-- `grace_ms`: The grace period for data arrival.
-It allows late-arriving data to be included in the window,
-even if it arrives after the window has theoretically moved forward.
-Can be specified as either an `int` representing milliseconds
-or a `timedelta` object.
->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
-value.
-- `name`: The unique identifier for the window. If not provided, it will be
-automatically generated based on the window's properties.
+- `reducer`: A function that takes two arguments
+(the accumulated value and a new value) and returns a single value.
+The returned value will be saved to the state store and sent downstream.
+- `initializer`: A function to call for every first element of the window.
+This function is used to initialize the aggregation within a window.
**Returns**:
-`HoppingWindowDefinition` instance representing the hopping
-window configuration.
-This object can be further configured with aggregation functions
-like `sum`, `count`, etc. and applied to the StreamingDataFrame.
+A window configured to perform custom reduce aggregation on the data.
-
+
-## quixstreams.error\_callbacks
+#### FixedTimeWindowDefinition.max
-
+```python
+def max() -> "FixedTimeWindow"
+```
-## quixstreams.exceptions
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L212)
-
+Configure a window to aggregate the maximum value within each window period.
-## quixstreams.exceptions.base
+**Returns**:
-
+an instance of `FixedTimeWindow` configured to calculate the maximum
+value within each window period.
-## quixstreams.exceptions.assignment
+
-
+#### FixedTimeWindowDefinition.min
-### PartitionAssignmentError
+```python
+def min() -> "FixedTimeWindow"
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L241)
+
+Configure a window to aggregate the minimum value within each window period.
+
+**Returns**:
+
+an instance of `FixedTimeWindow` configured to calculate the minimum
+value within each window period.
+
+
+
+## quixstreams.dataframe.windows
+
+
+
+## quixstreams.dataframe.windows.time\_based
+
+
+
+### FixedTimeWindow
```python
-class PartitionAssignmentError(QuixException)
+class FixedTimeWindow()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/exceptions/assignment.py#L6)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/time_based.py#L26)
-Error happened during partition rebalancing.
-Raised from `on_assign`, `on_revoke` and `on_lost` callbacks
+
-
+#### FixedTimeWindow.final
-## quixstreams.kafka
+```python
+def final(expand: bool = True) -> "StreamingDataFrame"
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/time_based.py#L89)
-## quixstreams.kafka.consumer
+Apply the window aggregation and return results only when the windows are
-
+closed.
-### Consumer
+The format of returned windows:
+```python
+{
+    "start": <window start time in milliseconds>,
+    "end": <window end time in milliseconds>,
+    "value": <aggregated window value>,
+}
+```
+
+The individual window is closed when the event time
+(the maximum observed timestamp across the partition) passes
+its end timestamp + grace period.
+The closed windows cannot receive updates anymore and are considered final.
+
+>***NOTE:*** Windows can be closed only within the same message key.
+If some message keys appear irregularly in the stream, the latest windows
+can remain unprocessed until a new message with the same key is received.
+
+**Arguments**:
+
+- `expand`: if `True`, each window result will be sent downstream as
+an individual item. Otherwise, the list of window results will be sent.
+Default - `True`
+
+
+
+#### FixedTimeWindow.current
```python
-class Consumer()
+def current(expand: bool = True) -> "StreamingDataFrame"
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L66)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/time_based.py#L126)
-
+Apply the window transformation to the StreamingDataFrame to return results
-#### Consumer.\_\_init\_\_
+for each updated window.
+The format of returned windows:
```python
-def __init__(broker_address: str,
- consumer_group: Optional[str],
- auto_offset_reset: AutoOffsetReset,
- auto_commit_enable: bool = True,
- assignment_strategy: AssignmentStrategy = "range",
- on_commit: Optional[Callable[
- [Optional[KafkaError], List[TopicPartition]], None]] = None,
- extra_config: Optional[dict] = None)
+{
+    "start": <window start time in milliseconds>,
+    "end": <window end time in milliseconds>,
+    "value": <aggregated window value>,
+}
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L67)
+This method processes streaming data and returns results as they come,
+regardless of whether the window is closed or not.
-A wrapper around `confluent_kafka.Consumer`.
+**Arguments**:
-It initializes `confluent_kafka.Consumer` on demand
-avoiding network calls during `__init__`, provides typing info for methods
-and some reasonable defaults.
+- `expand`: if `True`, each window result will be sent downstream as
+an individual item. Otherwise, the list of window results will be sent.
+Default - `True`
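+
+Example Snippet (a minimal sketch; the broker address and topic name are
+illustrative placeholders):
+
+```python
+from quixstreams import Application
+
+app = Application(broker_address="localhost:9092")
+sdf = app.dataframe(app.topic("temperature-readings"))
+
+# Emit the running 1-minute mean every time a window receives a new value,
+# instead of waiting for the window to close.
+sdf = (
+    sdf.tumbling_window(duration_ms=60_000)
+    .mean()
+    .current()
+)
+```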
+
+
+
+## quixstreams.dataframe.windows.base
+
+
+
+#### get\_window\_ranges
+
+```python
+def get_window_ranges(timestamp_ms: int,
+ duration_ms: int,
+ step_ms: Optional[int] = None) -> List[Tuple[int, int]]
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/base.py#L22)
+
+Get a list of window ranges for the given timestamp.
**Arguments**:
-- `broker_address`: Kafka broker host and port in format `:`.
-Passed as `bootstrap.servers` to `confluent_kafka.Consumer`.
-- `consumer_group`: Kafka consumer group.
-Passed as `group.id` to `confluent_kafka.Consumer`
-- `auto_offset_reset`: Consumer `auto.offset.reset` setting.
-Available values:
-- "earliest" - automatically reset the offset to the smallest offset
-- "latest" - automatically reset the offset to the largest offset
-- "error" - trigger an error (ERR__AUTO_OFFSET_RESET) which is retrieved
- by consuming messages (used for testing)
-- `auto_commit_enable`: If true, periodically commit offset of
-the last message handed to the application. Default - `True`.
-- `assignment_strategy`: The name of a partition assignment strategy.
-Available values: "range", "roundrobin", "cooperative-sticky".
-- `on_commit`: Offset commit result propagation callback.
-Passed as "offset_commit_cb" to `confluent_kafka.Consumer`.
-- `extra_config`: A dictionary with additional options that
-will be passed to `confluent_kafka.Consumer` as is.
-Note: values passed as arguments override values in `extra_config`.
+- `timestamp_ms`: timestamp in milliseconds
+- `duration_ms`: window duration in milliseconds
+- `step_ms`: window step in milliseconds for hopping windows, optional.
+
+**Returns**:
-
+a list of `(start, end)` tuples
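+
+A minimal sketch of calling this helper directly (the numbers are illustrative):
+
+```python
+from quixstreams.dataframe.windows.base import get_window_ranges
+
+# Hopping windows of 60s with a 30s step that contain the timestamp 100_000 ms:
+# (60_000, 120_000) and (90_000, 150_000).
+ranges = get_window_ranges(
+    timestamp_ms=100_000,
+    duration_ms=60_000,
+    step_ms=30_000,
+)
+print(ranges)
+```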
-#### Consumer.poll
+
-```python
-def poll(timeout: Optional[float] = None) -> Optional[Message]
-```
+## quixstreams.dataframe.base
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L126)
+
-Consumes a single message, calls callbacks and returns events.
+## quixstreams.rowproducer
-The application must check the returned :py:class:`Message`
-object's :py:func:`Message.error()` method to distinguish between proper
-messages (error() returns None), or an event or error.
+
-Note: Callbacks may be called from this method, such as
-``on_assign``, ``on_revoke``, et al.
+### RowProducer
-**Arguments**:
+```python
+class RowProducer()
+```
-- `timeout` (`float`): Maximum time in seconds to block waiting for message,
-event or callback. None or -1 is infinite. Default: None.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowproducer.py#L14)
-**Raises**:
+A producer class that is capable of serializing Rows to bytes and sending them to Kafka.
-- `None`: RuntimeError if called on a closed consumer
+The serialization is performed according to the Topic serialization settings.
-**Returns**:
+**Arguments**:
-A Message object or None on timeout
+- `broker_address`: Kafka broker host and port in format `<host>:<port>`.
+Passed as `bootstrap.servers` to `confluent_kafka.Producer`.
+- `partitioner`: A function to be used to determine the outgoing message
+partition.
+Available values: "random", "consistent_random", "murmur2", "murmur2_random",
+"fnv1a", "fnv1a_random"
+Default - "murmur2".
+- `extra_config`: A dictionary with additional options that
+will be passed to `confluent_kafka.Producer` as is.
+Note: values passed as arguments override values in `extra_config`.
+- `on_error`: a callback triggered when `RowProducer.produce_row()`
+or `RowProducer.poll()` fail.
+If the producer fails and the callback returns `True`, the exception
+will be logged but not propagated.
+The default callback logs an exception and returns `False`.
-
+
-#### Consumer.subscribe
+#### RowProducer.produce\_row
```python
-def subscribe(topics: List[str],
- on_assign: Optional[RebalancingCallback] = None,
- on_revoke: Optional[RebalancingCallback] = None,
- on_lost: Optional[RebalancingCallback] = None)
+def produce_row(row: Row,
+ topic: Topic,
+ key: Optional[Any] = None,
+ partition: Optional[int] = None,
+ timestamp: Optional[int] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L144)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowproducer.py#L55)
-Set subscription to supplied list of topics
+Serialize Row to bytes according to the Topic serialization settings
-This replaces a previous subscription.
+and produce it to Kafka
+
+If this method fails, it will trigger the provided "on_error" callback.
**Arguments**:
-- `topics` (`list(str)`): List of topics (strings) to subscribe to.
-- `on_assign` (`callable`): callback to provide handling of customized offsets
-on completion of a successful partition re-assignment.
-- `on_revoke` (`callable`): callback to provide handling of offset commits to
-a customized store on the start of a rebalance operation.
-- `on_lost` (`callable`): callback to provide handling in the case the partition
-assignment has been lost. Partitions that have been lost may already be
-owned by other members in the group and therefore committing offsets,
-for example, may fail.
+- `row`: Row object
+- `topic`: Topic object
+- `key`: message key, optional
+- `partition`: partition number, optional
+- `timestamp`: timestamp in milliseconds, optional
-**Raises**:
+
-- `KafkaException`:
-- `None`: RuntimeError if called on a closed consumer
-.. py:function:: on_assign(consumer, partitions)
-.. py:function:: on_revoke(consumer, partitions)
-.. py:function:: on_lost(consumer, partitions)
+#### RowProducer.poll
- :param Consumer consumer: Consumer instance.
- :param list(TopicPartition) partitions: Absolute list of partitions being
- assigned or revoked.
+```python
+def poll(timeout: float = None)
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowproducer.py#L92)
-#### Consumer.unsubscribe
+Polls the producer for events and calls `on_delivery` callbacks.
-```python
-def unsubscribe()
-```
+If `poll()` fails, it will trigger the provided "on_error" callback.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L238)
+**Arguments**:
-Remove current subscription.
+- `timeout`: timeout in seconds
-**Raises**:
+
-- `None`: KafkaException
-- `None`: RuntimeError if called on a closed consumer
+## quixstreams.core.stream.functions
-
+
-#### Consumer.store\_offsets
+### StreamFunction
```python
-def store_offsets(message: Optional[Message] = None,
- offsets: Optional[List[TopicPartition]] = None)
+class StreamFunction(abc.ABC)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L246)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L26)
-.. py:function:: store_offsets([message=None], [offsets=None])
+A base class for all the streaming operations in Quix Streams.
-Store offsets for a message or a list of offsets.
+It provides two methods that return closures to be called on the input values:
+- `get_executor` - a wrapper to execute on a single value
+- `get_executor_expanded` - a wrapper to execute on an expanded value.
+ Expanded value is a list, where each item should be treated as a separate value.
-``message`` and ``offsets`` are mutually exclusive. The stored offsets
-will be committed according to 'auto.commit.interval.ms' or manual
-offset-less `commit`.
-Note that 'enable.auto.offset.store' must be set to False when using this API.
+
-**Arguments**:
+#### StreamFunction.func
-- `message` (`confluent_kafka.Message`): Store message's offset+1.
-- `offsets` (`list(TopicPartition)`): List of topic+partitions+offsets to store.
+```python
+@property
+def func() -> StreamCallable
+```
-**Raises**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L43)
-- `None`: KafkaException
-- `None`: RuntimeError if called on a closed consumer
+The original function
-
+
-#### Consumer.commit
+#### StreamFunction.get\_executor
```python
-def commit(message: Optional[Message] = None,
- offsets: Optional[List[TopicPartition]] = None,
- asynchronous: bool = True) -> Optional[List[TopicPartition]]
+@abc.abstractmethod
+def get_executor() -> StreamCallable
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L280)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L50)
-Commit a message or a list of offsets.
+Returns a wrapper to be called on a single value.
-The ``message`` and ``offsets`` parameters are mutually exclusive.
-If neither is set, the current partition assignment's offsets are used instead.
-Use this method to commit offsets if you have 'enable.auto.commit' set to False.
+
-**Arguments**:
+#### StreamFunction.get\_executor\_expanded
-- `message` (`confluent_kafka.Message`): Commit the message's offset+1.
-Note: By convention, committed offsets reflect the next message
-to be consumed, **not** the last message consumed.
-- `offsets` (`list(TopicPartition)`): List of topic+partitions+offsets to commit.
-- `asynchronous` (`bool`): If true, asynchronously commit, returning None
-immediately. If False, the commit() call will block until the commit
-succeeds or fails and the committed offsets will be returned (on success).
-Note that specific partitions may have failed and the .err field of
-each partition should be checked for success.
+```python
+@abc.abstractmethod
+def get_executor_expanded() -> StreamCallable
+```
-**Raises**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L56)
-- `None`: KafkaException
-- `None`: RuntimeError if called on a closed consumer
+Returns a wrapper to be called on a list of expanded values.
-
+
-#### Consumer.committed
+### ApplyFunction
```python
-def committed(partitions: List[TopicPartition],
- timeout: Optional[float] = None) -> List[TopicPartition]
+class ApplyFunction(StreamFunction)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L320)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L62)
-.. py:function:: committed(partitions, [timeout=None])
+Wrap a function into an "Apply" function.
-Retrieve committed offsets for the specified partitions.
+The provided function is expected to return a new value based on input,
+and its result will always be passed downstream.
-**Arguments**:
+
-- `partitions` (`list(TopicPartition)`): List of topic+partitions to query for stored offsets.
-- `timeout` (`float`): Request timeout (seconds).
-None or -1 is infinite. Default: None
+### ApplyExpandFunction
-**Raises**:
+```python
+class ApplyExpandFunction(StreamFunction)
+```
-- `None`: KafkaException
-- `None`: RuntimeError if called on a closed consumer
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L85)
-**Returns**:
+Wrap a function into an "Apply" function and expand the returned iterable
+into separate values downstream.
-`list(TopicPartition)`: List of topic+partitions with offset and possibly error set.
+The provided function is expected to return an `Iterable`.
+If the returned value is not `Iterable`, `TypeError` will be raised.
-
+
-#### Consumer.get\_watermark\_offsets
+### FilterFunction
```python
-def get_watermark_offsets(partition: TopicPartition,
- timeout: Optional[float] = None,
- cached: bool = False) -> Tuple[int, int]
+class FilterFunction(StreamFunction)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L340)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L114)
-Retrieve low and high offsets for the specified partition.
+Wraps a function into a "Filter" function.
+The result of a Filter function is interpreted as boolean.
+If it's `True`, the input will be passed downstream.
+If it's `False`, the `Filtered` exception will be raised to signal that the
+value is filtered out.
-**Arguments**:
+
-- `partition` (`TopicPartition`): Topic+partition to return offsets for.
-- `timeout` (`float`): Request timeout (seconds). None or -1 is infinite.
-Ignored if cached=True. Default: None
-- `cached` (`bool`): Instead of querying the broker, use cached information.
-Cached values: The low offset is updated periodically
-(if statistics.interval.ms is set) while the high offset is updated on each
-message fetched from the broker for this partition.
+### UpdateFunction
-**Raises**:
+```python
+class UpdateFunction(StreamFunction)
+```
-- `None`: KafkaException
-- `None`: RuntimeError if called on a closed consumer
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L146)
-**Returns**:
+Wrap a function into an "Update" function.
-`tuple(int,int)`: Tuple of (low,high) on success or None on timeout.
-The high offset is the offset of the last message + 1.
+The provided function is expected to mutate the value
+or to perform some side effect.
+Its result will always be ignored, and its input is passed
+downstream.
-
+
-#### Consumer.list\_topics
+#### compose
```python
-def list_topics(topic: Optional[str] = None,
- timeout: Optional[float] = None) -> ClusterMetadata
+def compose(functions: List[StreamFunction],
+ allow_filters: bool = True,
+ allow_updates: bool = True,
+ allow_expands: bool = True) -> StreamCallable
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L366)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L175)
-.. py:function:: list_topics([topic=None], [timeout=-1])
+Composes a list of functions and its parents into a single
-Request metadata from the cluster.
-This method provides the same information as
-listTopics(), describeTopics() and describeCluster() in the Java Admin client.
+big closure like this:
+```
+[func, func, func] -> func(func(func()))
+```
+
+Closures are more performant than calling all functions one by one in a loop.
**Arguments**:
-- `topic` (`str`): If specified, only request information about this topic,
-else return results for all topics in cluster.
-Warning: If auto.create.topics.enable is set to true on the broker and
-an unknown topic is specified, it will be created.
-- `timeout` (`float`): The maximum response time before timing out
-None or -1 is infinite. Default: None
+- `functions`: list of `StreamFunction` objects to compose
+- `allow_filters`: If False, will fail with `ValueError` if
+the list has `FilterFunction`. Default - True.
+- `allow_updates`: If False, will fail with `ValueError` if
+the list has `UpdateFunction`. Default - True.
+- `allow_expands`: If False, will fail with `ValueError` if
+the list has `ApplyFunction` with "expand=True". Default - True.
**Raises**:
-- `None`: KafkaException
+- `ValueError`: if disallowed functions are present in the list of functions.
-
+
-#### Consumer.memberid
+#### composer
```python
-def memberid() -> str
+def composer(outer_func: StreamCallable,
+ inner_func: StreamCallable) -> Callable[[T], R]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L389)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L227)
-Return this client's broker-assigned group member id.
+A function that wraps two other functions into a closure.
-The member id is assigned by the group coordinator and is propagated to
-the consumer during rebalance.
+It passes the result of the inner function as an input to the outer function.
- :returns: Member id string or None
- :rtype: string
- :raises: RuntimeError if called on a closed consumer
+**Returns**:
+a function with one argument (value)
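+
+A minimal sketch of the composition order (the functions are illustrative):
+
+```python
+from quixstreams.core.stream.functions import composer
+
+
+def add_one(v):
+    return v + 1
+
+
+def double(v):
+    return v * 2
+
+
+# The inner function runs first, and its result is passed to the outer one:
+# combined(3) == double(add_one(3)) == 8
+combined = composer(double, add_one)
+print(combined(3))
+```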
-
+
-#### Consumer.offsets\_for\_times
+## quixstreams.core.stream
-```python
-def offsets_for_times(partitions: List[TopicPartition],
- timeout: Optional[float] = None) -> List[TopicPartition]
-```
+
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L402)
+## quixstreams.core.stream.stream
-Look up offsets by timestamp for the specified partitions.
+
-The returned offset for each partition is the earliest offset whose
-timestamp is greater than or equal to the given timestamp in the
-corresponding partition. If the provided timestamp exceeds that of the
-last message in the partition, a value of -1 will be returned.
+### Stream
- :param list(TopicPartition) partitions: topic+partitions with timestamps
- in the TopicPartition.offset field.
- :param float timeout: The maximum response time before timing out.
- None or -1 is infinite. Default: None
- :returns: List of topic+partition with offset field set and possibly error set
- :rtype: list(TopicPartition)
- :raises: KafkaException
- :raises: RuntimeError if called on a closed consumer
+```python
+class Stream()
+```
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L22)
-
+
-#### Consumer.pause
+#### Stream.\_\_init\_\_
```python
-def pause(partitions: List[TopicPartition])
+def __init__(func: Optional[StreamFunction] = None,
+ parent: Optional[Self] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L428)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L23)
-Pause consumption for the provided list of partitions.
-
-Paused partitions must be tracked manually.
+A base class for all streaming operations.
-Does NOT affect the result of Consumer.assignment().
+`Stream` is an abstraction of a function pipeline.
+Each Stream has a function and a parent (None by default).
+When adding new function to the stream, it creates a new `Stream` object and
+sets "parent" to the previous `Stream` to maintain an order of execution.
-**Arguments**:
+Streams support 3 types of functions:
+- "Apply" - generate new values based on a previous one.
+ The result of an Apply function is passed downstream to the next functions.
+ If "expand=True" is passed and the function returns an `Iterable`,
+ each item of it will be treated as a separate value downstream.
+- "Update" - update values in-place.
+ The result of an Update function is always ignored, and its input is passed
+ downstream.
+- "Filter" - to filter values from the Stream.
+ The result of a Filter function is interpreted as boolean.
+ If it's `True`, the input will be passed downstream.
+ If it's `False`, the `Filtered` exception will be raised to signal that the
+ value is filtered out.
-- `partitions` (`list(TopicPartition)`): List of topic+partitions to pause.
+To execute the functions on the `Stream`, call the `.compose()` method, and
+it will return a closure to execute all the functions accumulated in the Stream
+and its parents.
-**Raises**:
+**Arguments**:
-- `None`: KafkaException
+- `func`: a function to be called on the stream.
+It is expected to be wrapped into one of "Apply", "Filter" or "Update" from
+`quixstreams.core.stream.functions` package.
+Default - "Apply(lambda v: v)".
+- `parent`: a parent `Stream`
-
+
-#### Consumer.resume
+#### Stream.add\_filter
```python
-def resume(partitions: List[TopicPartition])
+def add_filter(func: Callable[[T], R]) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L442)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L79)
-.. py:function:: resume(partitions)
+Add a function to filter values from the Stream.
-Resume consumption for the provided list of partitions.
+The return value of the function will be interpreted as `bool`.
+If the function returns a `False`-like result, the Stream will raise `Filtered`
+exception during execution.
**Arguments**:
-- `partitions` (`list(TopicPartition)`): List of topic+partitions to resume.
+- `func`: a function to filter values from the stream
-**Raises**:
+**Returns**:
-- `None`: KafkaException
+a new `Stream` derived from the current one
-
+
-#### Consumer.position
+#### Stream.add\_apply
```python
-def position(partitions: List[TopicPartition]) -> List[TopicPartition]
+def add_apply(func: Callable[[T], R], expand: bool = False) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L454)
-
-Retrieve current positions (offsets) for the specified partitions.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L92)
-**Arguments**:
+Add an "apply" function to the Stream.
-- `partitions` (`list(TopicPartition)`): List of topic+partitions to return
-current offsets for. The current offset is the offset of
-the last consumed message + 1.
+The function is supposed to return a new value, which will be passed
+further during execution.
-**Raises**:
+**Arguments**:
-- `None`: KafkaException
-- `None`: RuntimeError if called on a closed consumer
+- `func`: a function to generate a new value
+- `expand`: if True, expand the returned iterable into individual values
+downstream. If the returned value is not iterable, `TypeError` will be raised.
+Default - `False`.
**Returns**:
-`list(TopicPartition)`: List of topic+partitions with offset and possibly error set.
+a new `Stream` derived from the current one
-
+
-#### Consumer.seek
+#### Stream.add\_update
```python
-def seek(partition: TopicPartition)
+def add_update(func: Callable[[T], object]) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L468)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L109)
-Set consume position for partition to offset.
-
-The offset may be an absolute (>=0) or a
-logical offset (:py:const:`OFFSET_BEGINNING` et.al).
+Add an "update" function to the Stream that mutates the input value.
-seek() may only be used to update the consume offset of an
-actively consumed partition (i.e., after :py:const:`assign()`),
-to set the starting offset of partition not being consumed instead
-pass the offset in an `assign()` call.
+The return of this function will be ignored and its input
+will be passed downstream.
**Arguments**:
-- `partition` (`TopicPartition`): Topic+partition+offset to seek to.
+- `func`: a function to mutate the value
-**Raises**:
+**Returns**:
-- `None`: KafkaException
+a new Stream derived from the current one
-
+
-#### Consumer.assignment
+#### Stream.diff
```python
-def assignment() -> List[TopicPartition]
+def diff(other: "Stream") -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L485)
-
-Returns the current partition assignment.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L121)
-**Raises**:
+Takes the difference between Streams `self` and `other` based on their last
-- `None`: KafkaException
-- `None`: RuntimeError if called on a closed consumer
+common parent, and returns a new `Stream` that includes only this difference.
-**Returns**:
+It's impossible to calculate a diff when:
+ - Streams don't have a common parent.
+ - The `self` Stream already includes all the nodes from
+ the `other` Stream, and the resulting diff is empty.
-`list(TopicPartition)`: List of assigned topic+partitions.
+**Arguments**:
-
+- `other`: a `Stream` to take a diff from.
-#### Consumer.set\_sasl\_credentials
+**Raises**:
-```python
-def set_sasl_credentials(username: str, password: str)
-```
+- `ValueError`: if Streams don't have a common parent
+or if the diff is empty.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L498)
+**Returns**:
-Sets the SASL credentials used for this client.
-These credentials will overwrite the old ones, and will be used the next
-time the client needs to authenticate.
-This method will not disconnect existing broker connections that have been
-established with the old credentials.
-This method is applicable only to SASL PLAIN and SCRAM mechanisms.
+new `Stream` instance including all the Streams from the diff
-
+
-#### Consumer.incremental\_assign
+#### Stream.tree
```python
-def incremental_assign(partitions: List[TopicPartition])
+def tree() -> List[Self]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L510)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L150)
-Assign new partitions.
+Return a list of all parent Streams including the node itself.
-Can be called outside the `Consumer` `on_assign` callback (multiple times).
-Partitions immediately show on `Consumer.assignment()`.
+The tree is ordered from child to parent (current node comes first).
-Any additional partitions besides the ones passed during the `Consumer`
-`on_assign` callback will NOT be associated with the consumer group.
+**Returns**:
-
+a list of `Stream` objects
-#### Consumer.incremental\_unassign
+
+
+#### Stream.compose
```python
-def incremental_unassign(partitions: List[TopicPartition])
+def compose(allow_filters: bool = True,
+ allow_updates: bool = True,
+ allow_expands: bool = True) -> Callable[[T], R]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L522)
-
-Revoke partitions.
-
-Can be called outside an on_revoke callback.
-
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L164)
-#### Consumer.close
+Compose a list of functions from this `Stream` and its parents into one
-```python
-def close()
-```
+big closure using a "composer" function.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L530)
+Closures are more performant than calling all the functions in the
+`Stream.tree()` one-by-one.
-Close down and terminate the Kafka Consumer.
+**Arguments**:
-Actions performed:
+- `allow_filters`: If False, this function will fail with `ValueError` if
+the stream has filter functions in the tree. Default - True.
+- `allow_updates`: If False, this function will fail with `ValueError` if
+the stream has update functions in the tree. Default - True.
+- `allow_expands`: If False, this function will fail with `ValueError` if
+the stream has functions with "expand=True" in the tree. Default - True.
-- Stops consuming.
-- Commits offsets, unless the consumer property 'enable.auto.commit' is set to False.
-- Leaves the consumer group.
+**Raises**:
-Registered callbacks may be called from this method,
-see `poll()` for more info.
+- `ValueError`: if disallowed functions are present in the stream tree.
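+
+A minimal sketch of building and composing a `Stream` by hand (normally a
+`StreamingDataFrame` does this for you; the functions are illustrative):
+
+```python
+from quixstreams.core.stream.stream import Stream
+
+stream = (
+    Stream()
+    .add_apply(lambda v: v + 1)              # "Apply": produce a new value
+    .add_filter(lambda v: v % 2 == 0)        # "Filter": keep only even values
+    .add_update(lambda v: print("got", v))   # "Update": side effect, input passed on
+)
+
+executor = stream.compose()
+result = executor(1)  # 1 -> 2, passes the filter, prints "got 2", returns 2
+```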
+
-
+## quixstreams.core
-## quixstreams.kafka.producer
+
-
+## quixstreams.processing\_context
-### Producer
+
+
+### ProcessingContext
```python
-class Producer()
+@dataclasses.dataclass
+class ProcessingContext()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L54)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/processing_context.py#L21)
-
+A class to share processing-related objects
+between `Application` and `StreamingDataFrame` instances.
-#### Producer.\_\_init\_\_
+
+
+#### ProcessingContext.store\_offset
```python
-def __init__(broker_address: str,
- partitioner: Partitioner = "murmur2",
- extra_config: Optional[dict] = None)
+def store_offset(topic: str, partition: int, offset: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L55)
-
-A wrapper around `confluent_kafka.Producer`.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/processing_context.py#L41)
-It initializes `confluent_kafka.Producer` on demand
-avoiding network calls during `__init__`, provides typing info for methods
-and some reasonable defaults.
+Store the offset of the processed message to the checkpoint.
**Arguments**:
-- `broker_address`: Kafka broker host and port in format `:`.
-Passed as `bootstrap.servers` to `confluent_kafka.Producer`.
-- `partitioner`: A function to be used to determine the outgoing message
-partition.
-Available values: "random", "consistent_random", "murmur2", "murmur2_random",
-"fnv1a", "fnv1a_random"
-Default - "murmur2".
-- `extra_config`: A dictionary with additional options that
-will be passed to `confluent_kafka.Producer` as is.
-Note: values passed as arguments override values in `extra_config`.
+- `topic`: topic name
+- `partition`: partition number
+- `offset`: message offset
-
+
-#### Producer.produce
+#### ProcessingContext.init\_checkpoint
```python
-def produce(topic: str,
- value: Optional[Union[str, bytes]] = None,
- key: Optional[Union[str, bytes]] = None,
- headers: Optional[Headers] = None,
- partition: Optional[int] = None,
- timestamp: Optional[int] = None,
- poll_timeout: float = 5.0,
- buffer_error_max_tries: int = 3)
+def init_checkpoint()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L94)
-
-Produce message to topic.
-
-It also polls Kafka for callbacks before producing in order to minimize
-the probability of `BufferError`.
-If `BufferError` still happens, the method will poll Kafka with timeout
-to free up the buffer and try again.
-
-**Arguments**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/processing_context.py#L51)
-- `topic`: topic name
-- `value`: message value
-- `key`: message key
-- `headers`: message headers
-- `partition`: topic partition
-- `timestamp`: message timestamp
-- `poll_timeout`: timeout for `poll()` call in case of `BufferError`
-- `buffer_error_max_tries`: max retries for `BufferError`.
-Pass `0` to not retry after `BufferError`.
+Initialize a new checkpoint
-
+
-#### Producer.poll
+#### ProcessingContext.commit\_checkpoint
```python
-def poll(timeout: float = 0)
+def commit_checkpoint(force: bool = False)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L152)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/processing_context.py#L62)
-Polls the producer for events and calls `on_delivery` callbacks.
+Commit the current checkpoint.
-**Arguments**:
+The actual commit will happen only when:
-- `timeout`: poll timeout seconds; Default: 0 (unlike others)
-> NOTE: -1 will hang indefinitely if there are no messages to acknowledge
+1. The checkpoint has at least one stored offset
+2. The checkpoint is expired or `force=True` is passed
-
+**Arguments**:
-#### Producer.flush
+- `force`: if `True`, commit the checkpoint before its expiration deadline.
-```python
-def flush(timeout: Optional[float] = None) -> int
-```
+
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L160)
+## quixstreams.utils
-Wait for all messages in the Producer queue to be delivered.
+
-**Arguments**:
+## quixstreams.utils.dicts
-- `timeout` (`float`): time to attempt flushing (seconds).
-None or -1 is infinite. Default: None
+
-**Returns**:
+#### dict\_values
-number of messages remaining to flush
+```python
+def dict_values(d: object) -> List
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/utils/dicts.py#L4)
-## quixstreams.models
+Recursively unpacks a set of nested dicts to get a flattened list of leaves,
-
+where "leaves" are the first non-dict item.
-## quixstreams.models.serializers
+e.g. `{"a": {"b": {"c": 1}, "d": 2}, "e": 3}` becomes `[1, 2, 3]`
-
+**Arguments**:
-## quixstreams.models.serializers.json
+- `d`: initially, a dict (with potentially nested dicts)
-
+**Returns**:
-### JSONSerializer
+a list with all the leaves of the various contained dicts
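+
+A minimal usage sketch (mirroring the example above):
+
+```python
+from quixstreams.utils.dicts import dict_values
+
+nested = {"a": {"b": {"c": 1}, "d": 2}, "e": 3}
+print(dict_values(nested))  # [1, 2, 3]
+```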
-```python
-class JSONSerializer(Serializer)
-```
+
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/json.py#L13)
+## quixstreams.utils.json
-
+
-#### JSONSerializer.\_\_init\_\_
+#### dumps
```python
-def __init__(dumps: Callable[[Any], Union[str, bytes]] = default_dumps)
+def dumps(value: Any) -> bytes
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/json.py#L14)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/utils/json.py#L8)
-Serializer that returns data in json format.
+Serialize to JSON using `orjson` package.
**Arguments**:
-- `dumps`: a function to serialize objects to json.
-Default - :py:func:`quixstreams.utils.json.dumps`
+- `value`: value to serialize to JSON
-
+**Returns**:
-### JSONDeserializer
+bytes
+
+
+
+#### loads
```python
-class JSONDeserializer(Deserializer)
+def loads(value: bytes) -> Any
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/json.py#L35)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/utils/json.py#L18)
-
+Deserialize from JSON using `orjson` package.
-#### JSONDeserializer.\_\_init\_\_
+Main differences from the built-in `json` module:
+- The input value must be `bytes`
+- Non-str keys in dictionaries are not allowed
-```python
-def __init__(column_name: Optional[str] = None,
- loads: Callable[[Union[bytes, bytearray]], Any] = default_loads)
-```
+**Arguments**:
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/json.py#L36)
+- `value`: value to deserialize from
-Deserializer that parses data from JSON
+**Returns**:
-**Arguments**:
+object
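+
+A minimal round-trip sketch using both helpers:
+
+```python
+from quixstreams.utils.json import dumps, loads
+
+raw = dumps({"temperature": 21.5})  # bytes
+print(loads(raw))                   # {'temperature': 21.5}
+```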
-- `column_name`: if provided, the deserialized value will be wrapped into
-dictionary with `column_name` as a key.
-- `loads`: function to parse json from bytes.
-Default - :py:func:`quixstreams.utils.json.loads`.
+
-
+## quixstreams.types
-## quixstreams.models.serializers.simple\_types
+
-
+## quixstreams.models.timestamps
-### BytesDeserializer
+
+
+### TimestampType
```python
-class BytesDeserializer(Deserializer)
+class TimestampType(enum.IntEnum)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L44)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/timestamps.py#L8)
-A deserializer to bypass bytes without any changes
+
-
+#### TIMESTAMP\_NOT\_AVAILABLE
-### BytesSerializer
+timestamps not supported by broker
-```python
-class BytesSerializer(Serializer)
-```
+
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L55)
+#### TIMESTAMP\_CREATE\_TIME
-A serializer to bypass bytes without any changes
+message creation time (or source / producer time)
-
+
-### StringDeserializer
+#### TIMESTAMP\_LOG\_APPEND\_TIME
+
+broker receive time
+
+
+
+### MessageTimestamp
```python
-class StringDeserializer(Deserializer)
+class MessageTimestamp()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L64)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/timestamps.py#L14)
-
+Represents a timestamp of an incoming Kafka message.
-#### StringDeserializer.\_\_init\_\_
+It is made pseudo-immutable (i.e. public attributes don't have setters), and
+it should not be mutated during message processing.
+
+
+
+#### MessageTimestamp.create
```python
-def __init__(column_name: Optional[str] = None, codec: str = "utf_8")
+@classmethod
+def create(cls, timestamp_type: int, milliseconds: int) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L65)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/timestamps.py#L41)
-Deserializes bytes to strings using the specified encoding.
+Create a Timestamp object based on data
+
+from `confluent_kafka.Message.timestamp()`.
+
+If the timestamp type is "TIMESTAMP_NOT_AVAILABLE", the milliseconds are set to None.
**Arguments**:
-- `codec`: string encoding
-A wrapper around `confluent_kafka.serialization.StringDeserializer`.
+- `timestamp_type`: a timestamp type represented as a number
+Can be one of:
+- "0" - TIMESTAMP_NOT_AVAILABLE, timestamps not supported by broker.
+- "1" - TIMESTAMP_CREATE_TIME, message creation time (or source / producer time).
+- "2" - TIMESTAMP_LOG_APPEND_TIME, broker receive time.
+- `milliseconds`: the number of milliseconds since the epoch (UTC).
-
+**Returns**:
-### IntegerDeserializer
+Timestamp object
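+
+A minimal sketch (the millisecond value is illustrative):
+
+```python
+from quixstreams.models.timestamps import MessageTimestamp, TimestampType
+
+ts = MessageTimestamp.create(
+    timestamp_type=TimestampType.TIMESTAMP_CREATE_TIME,
+    milliseconds=1_700_000_000_000,
+)
+```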
-```python
-class IntegerDeserializer(Deserializer)
-```
+
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L84)
+## quixstreams.models
-Deserializes bytes to integers.
+
-A wrapper around `confluent_kafka.serialization.IntegerDeserializer`.
+## quixstreams.models.messagecontext
-
+
-### DoubleDeserializer
+### MessageContext
```python
-class DoubleDeserializer(Deserializer)
+class MessageContext()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L103)
-
-Deserializes float to IEEE 764 binary64.
-
-A wrapper around `confluent_kafka.serialization.DoubleDeserializer`.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/messagecontext.py#L7)
-
+An object with Kafka message properties.
-### StringSerializer
+It is made pseudo-immutable (i.e. public attributes don't have setters), and
+it should not be mutated during message processing.
-```python
-class StringSerializer(Serializer)
-```
+
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L122)
+## quixstreams.models.types
-
+
-#### StringSerializer.\_\_init\_\_
+### ConfluentKafkaMessageProto
```python
-def __init__(codec: str = "utf_8")
+class ConfluentKafkaMessageProto(Protocol)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L123)
-
-Serializes strings to bytes using the specified encoding.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/types.py#L12)
-**Arguments**:
+An interface of `confluent_kafka.Message`.
-- `codec`: string encoding
+Use it to avoid depending on the exact implementation and to simplify testing.
-
+Instances of `confluent_kafka.Message` cannot be directly created from Python,
+see https://github.com/confluentinc/confluent-kafka-python/issues/1535.
-### IntegerSerializer
+
-```python
-class IntegerSerializer(Serializer)
-```
+## quixstreams.models.serializers
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L135)
+
-Serializes integers to bytes
+## quixstreams.models.serializers.exceptions
-
+
-### DoubleSerializer
+### IgnoreMessage
```python
-class DoubleSerializer(Serializer)
+class IgnoreMessage(exceptions.QuixException)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L148)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/exceptions.py#L46)
-Serializes floats to bytes
+Raise this exception from Deserializer.__call__ in order to ignore the processing
+of the particular message.
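+
+A minimal sketch of a custom deserializer that skips certain messages.
+It assumes the `(value, ctx)` call signature of the base `Deserializer`,
+and the skipping condition is purely illustrative:
+
+```python
+from quixstreams.models.serializers.exceptions import IgnoreMessage
+from quixstreams.models.serializers.json import JSONDeserializer
+
+
+class SkipEmptyDeserializer(JSONDeserializer):
+    def __call__(self, value, ctx):
+        data = super().__call__(value, ctx)
+        if not data:
+            # Tell the application to skip this message entirely
+            raise IgnoreMessage("empty payload")
+        return data
+```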
@@ -2549,7 +2678,7 @@ Serializes floats to bytes
class QuixDeserializer(JSONDeserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L73)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L73)
Handles Deserialization for any Quix-formatted topic.
@@ -2564,7 +2693,7 @@ def __init__(column_name: Optional[str] = None,
loads: Callable[[Union[bytes, bytearray]], Any] = default_loads)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L80)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L80)
**Arguments**:
@@ -2582,7 +2711,7 @@ Default - :py:func:`quixstreams.utils.json.loads`.
def split_values() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L100)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L100)
Each Quix message might contain data for multiple Rows.
This property informs the downstream processors about that, so they can
@@ -2597,7 +2726,7 @@ def deserialize(model_key: str, value: Union[List[Mapping],
Mapping]) -> Iterable[Mapping]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L153)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L153)
Deserialization function for particular data types (Timeseries or EventData).
@@ -2618,7 +2747,7 @@ Iterable of dicts
class QuixSerializer(JSONSerializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L274)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L274)
@@ -2629,7 +2758,7 @@ def __init__(as_legacy: bool = True,
dumps: Callable[[Any], Union[str, bytes]] = default_dumps)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L278)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L278)
Serializer that returns data in json format.
@@ -2647,7 +2776,7 @@ Default - :py:func:`quixstreams.utils.json.dumps`
class QuixTimeseriesSerializer(QuixSerializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L321)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L321)
Serialize data to JSON formatted according to Quix Timeseries format.
@@ -2679,7 +2808,7 @@ Output:
class QuixEventsSerializer(QuixSerializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L409)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L409)
Serialize data to JSON formatted according to Quix EventData format.
The input value is expected to be a dictionary with the following keys:
@@ -2708,353 +2837,363 @@ Output:
}
```
-
+
-## quixstreams.models.serializers.base
+## quixstreams.models.serializers.simple\_types
-
+
-### SerializationContext
+### BytesDeserializer
```python
-class SerializationContext()
+class BytesDeserializer(Deserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L22)
-
-Provides additional context for message serialization/deserialization.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L44)
-Every `Serializer` and `Deserializer` receives an instance of `SerializationContext`
+A deserializer to bypass bytes without any changes
-
+
-#### SerializationContext.to\_confluent\_ctx
+### BytesSerializer
```python
-def to_confluent_ctx(field: MessageField) -> _SerializationContext
+class BytesSerializer(Serializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L35)
-
-Convert `SerializationContext` to `confluent_kafka.SerializationContext`
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L55)
-in order to re-use serialization already provided by `confluent_kafka` library.
+A serializer to bypass bytes without any changes
-**Arguments**:
+
-- `field`: instance of `confluent_kafka.serialization.MessageField`
+### StringDeserializer
-**Returns**:
+```python
+class StringDeserializer(Deserializer)
+```
-instance of `confluent_kafka.serialization.SerializationContext`
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L64)
-
+
-### Deserializer
+#### StringDeserializer.\_\_init\_\_
```python
-class Deserializer(abc.ABC)
+def __init__(column_name: Optional[str] = None, codec: str = "utf_8")
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L47)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L65)
-
+Deserializes bytes to strings using the specified encoding.
-#### Deserializer.\_\_init\_\_
+**Arguments**:
+
+- `codec`: string encoding
+A wrapper around `confluent_kafka.serialization.StringDeserializer`.
+
+
+
+### IntegerDeserializer
```python
-def __init__(column_name: Optional[str] = None, *args, **kwargs)
+class IntegerDeserializer(Deserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L48)
-
-A base class for all Deserializers
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L84)
-**Arguments**:
+Deserializes bytes to integers.
-- `column_name`: if provided, the deserialized value will be wrapped into
-dictionary with `column_name` as a key.
+A wrapper around `confluent_kafka.serialization.IntegerDeserializer`.
-
+
-#### Deserializer.split\_values
+### DoubleDeserializer
```python
-@property
-def split_values() -> bool
+class DoubleDeserializer(Deserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L58)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L103)
-Return True if the deserialized message should be considered as Iterable
-and each item in it should be processed as a separate message.
+Deserializes float to IEEE 764 binary64.
-
+A wrapper around `confluent_kafka.serialization.DoubleDeserializer`.
-### Serializer
+
+
+### StringSerializer
```python
-class Serializer(abc.ABC)
+class StringSerializer(Serializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L74)
-
-A base class for all Serializers
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L122)
-
+
-#### Serializer.extra\_headers
+#### StringSerializer.\_\_init\_\_
```python
-@property
-def extra_headers() -> MessageHeadersMapping
+def __init__(codec: str = "utf_8")
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L80)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L123)
-Informs producer to set additional headers
+Serializes strings to bytes using the specified encoding.
-for the message it will be serializing
+**Arguments**:
-Must return a dictionary with headers.
-Keys must be strings, and values must be strings, bytes or None.
+- `codec`: string encoding
-**Returns**:
+
-dict with headers
+### IntegerSerializer
-
+```python
+class IntegerSerializer(Serializer)
+```
-## quixstreams.models.serializers.exceptions
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L135)
-
+Serializes integers to bytes
-### IgnoreMessage
+
+
+### DoubleSerializer
```python
-class IgnoreMessage(exceptions.QuixException)
+class DoubleSerializer(Serializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/exceptions.py#L46)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L148)
-Raise this exception from Deserializer.__call__ in order to ignore the processing
-of the particular message.
+Serializes floats to bytes.
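+
+For example, a minimal sketch of plugging these simple-type classes into topic
+definitions; the broker address and topic names are placeholders, and it assumes
+`Application.topic()` accepts serializer/deserializer instances as well as type names:
+
+```python
+from quixstreams import Application
+from quixstreams.models.serializers.simple_types import (
+    StringDeserializer,
+    StringSerializer,
+)
+
+app = Application(broker_address="localhost:9092")  # placeholder broker
+
+# Incoming values are decoded as UTF-8 strings; outgoing values are encoded back to bytes
+input_topic = app.topic("raw-text", value_deserializer=StringDeserializer())
+output_topic = app.topic("clean-text", value_serializer=StringSerializer(codec="utf_8"))
+```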
-
+
-## quixstreams.models.topics
+## quixstreams.models.serializers.json
-
+
-## quixstreams.models.topics.manager
+### JSONSerializer
-
+```python
+class JSONSerializer(Serializer)
+```
-#### affirm\_ready\_for\_create
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/json.py#L13)
+
+
+
+#### JSONSerializer.\_\_init\_\_
```python
-def affirm_ready_for_create(topics: List[Topic])
+def __init__(dumps: Callable[[Any], Union[str, bytes]] = default_dumps)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L19)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/json.py#L14)
-Validate a list of topics is ready for creation attempt
+Serializer that returns data in JSON format.
**Arguments**:
-- `topics`: list of `Topic`s
+- `dumps`: a function to serialize objects to json.
+Default - :py:func:`quixstreams.utils.json.dumps`
-
+
-### TopicManager
+### JSONDeserializer
```python
-class TopicManager()
+class JSONDeserializer(Deserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L29)
-
-The source of all topic management with quixstreams.
-
-Generally initialized and managed automatically by an `Application`,
-but allows a user to work with it directly when needed, such as using it alongside
-a plain `Producer` to create its topics.
-
-See methods for details.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/json.py#L35)
-
+
-#### TopicManager.\_\_init\_\_
+#### JSONDeserializer.\_\_init\_\_
```python
-def __init__(topic_admin: TopicAdmin, create_timeout: int = 60)
+def __init__(column_name: Optional[str] = None,
+ loads: Callable[[Union[bytes, bytearray]], Any] = default_loads)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L48)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/json.py#L36)
+
+Deserializer that parses data from JSON
**Arguments**:
-- `topic_admin`: an `Admin` instance (required for some functionality)
-- `create_timeout`: timeout for topic creation
+- `column_name`: if provided, the deserialized value will be wrapped into
+dictionary with `column_name` as a key.
+- `loads`: function to parse json from bytes.
+Default - :py:func:`quixstreams.utils.json.loads`.
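+
+For example, a minimal sketch of swapping in custom `dumps`/`loads` callables, here
+using the standard library `json` with an illustrative formatting option:
+
+```python
+import json
+
+from quixstreams.models.serializers.json import JSONDeserializer, JSONSerializer
+
+# Pretty-print outgoing JSON and parse incoming JSON with the standard library
+serializer = JSONSerializer(dumps=lambda obj: json.dumps(obj, indent=2))
+deserializer = JSONDeserializer(loads=json.loads)
+```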
-
+
-#### TopicManager.changelog\_topics
+## quixstreams.models.serializers.base
+
+
+
+### SerializationContext
```python
-@property
-def changelog_topics() -> Dict[str, Dict[str, Topic]]
+class SerializationContext()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L71)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L22)
-Note: `Topic`s are the changelogs.
+Provides additional context for message serialization/deserialization.
-returns: the changelog topic dict, {topic_name: {suffix: Topic}}
+Every `Serializer` and `Deserializer` receives an instance of `SerializationContext`.
-
+
-#### TopicManager.topic\_config
+#### SerializationContext.to\_confluent\_ctx
```python
-def topic_config(num_partitions: Optional[int] = None,
- replication_factor: Optional[int] = None,
- extra_config: Optional[dict] = None) -> TopicConfig
+def to_confluent_ctx(field: MessageField) -> _SerializationContext
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L121)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L35)
-Convenience method for generating a `TopicConfig` with default settings
+Convert `SerializationContext` to `confluent_kafka.SerializationContext`
+
+in order to re-use serialization already provided by `confluent_kafka` library.
**Arguments**:
-- `num_partitions`: the number of topic partitions
-- `replication_factor`: the topic replication factor
-- `extra_config`: other optional configuration settings
+- `field`: instance of `confluent_kafka.serialization.MessageField`
**Returns**:
-a TopicConfig object
+instance of `confluent_kafka.serialization.SerializationContext`
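+
+For example, a minimal sketch of a custom deserializer that reuses a
+`confluent_kafka` deserializer through `to_confluent_ctx`; the class name is
+hypothetical and the `__call__` signature is assumed to match the `Deserializer`
+base class documented below:
+
+```python
+from confluent_kafka.serialization import DoubleDeserializer as CKDoubleDeserializer
+from confluent_kafka.serialization import MessageField
+
+from quixstreams.models.serializers.base import Deserializer, SerializationContext
+
+
+class DelegatingDoubleDeserializer(Deserializer):
+    """Hypothetical deserializer that delegates to confluent_kafka's implementation."""
+
+    def __init__(self):
+        super().__init__()
+        self._inner = CKDoubleDeserializer()
+
+    def __call__(self, value: bytes, ctx: SerializationContext) -> float:
+        # Convert the quixstreams context into confluent_kafka's before delegating
+        return self._inner(value, ctx.to_confluent_ctx(MessageField.VALUE))
+```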
-
+
-#### TopicManager.topic
+### Deserializer
```python
-def topic(name: str,
- value_deserializer: Optional[DeserializerType] = None,
- key_deserializer: Optional[DeserializerType] = "bytes",
- value_serializer: Optional[SerializerType] = None,
- key_serializer: Optional[SerializerType] = "bytes",
- config: Optional[TopicConfig] = None,
- timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic
+class Deserializer(abc.ABC)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L47)
+
+
+
+#### Deserializer.\_\_init\_\_
+
+```python
+def __init__(column_name: Optional[str] = None, *args, **kwargs)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L48)
+
+A base class for all Deserializers
+
+**Arguments**:
+
+- `column_name`: if provided, the deserialized value will be wrapped into
+dictionary with `column_name` as a key.
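+
+For example, a minimal sketch of the wrapping behaviour; the topic and column names
+are placeholders, and it assumes the simple-type deserializers expose `column_name`
+via this base constructor:
+
+```python
+from quixstreams import Application
+from quixstreams.models.serializers.simple_types import IntegerDeserializer
+
+app = Application(broker_address="localhost:9092")  # placeholder broker
+
+# Each raw integer payload is wrapped as {"count": <value>}, so downstream
+# operations can address it by column name.
+counts_topic = app.topic(
+    "counts",
+    value_deserializer=IntegerDeserializer(column_name="count"),
+)
+```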
+
+
+
+#### Deserializer.split\_values
+
+```python
+@property
+def split_values() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L142)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L58)
-A convenience method for generating a `Topic`. Will use default config options
+Return True if the deserialized message should be considered as Iterable
+and each item in it should be processed as a separate message.
-as dictated by the TopicManager.
+
-**Arguments**:
+### Serializer
-- `name`: topic name
-- `value_deserializer`: a deserializer type for values
-- `key_deserializer`: a deserializer type for keys
-- `value_serializer`: a serializer type for values
-- `key_serializer`: a serializer type for keys
-- `config`: optional topic configurations (for creation/validation)
-- `timestamp_extractor`: a callable that returns a timestamp in
-milliseconds from a deserialized message.
+```python
+class Serializer(abc.ABC)
+```
-**Returns**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L74)
-Topic object with creation configs
+A base class for all Serializers
-
+
-#### TopicManager.changelog\_topic
+#### Serializer.extra\_headers
```python
-def changelog_topic(topic_name: str, store_name: str,
- consumer_group: str) -> Topic
+@property
+def extra_headers() -> MessageHeadersMapping
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L191)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L80)
-Performs all the logic necessary to generate a changelog topic based on a
+Informs producer to set additional headers
-"source topic" (aka input/consumed topic).
+for the message it will be serializing
-Its main goal is to ensure partition counts of the to-be generated changelog
-match the source topic, and ensure the changelog topic is compacted. Also
-enforces the serialization type. All `Topic` objects generated with this are
-stored on the TopicManager.
+Must return a dictionary with headers.
+Keys must be strings, and values must be strings, bytes or None.
-If source topic already exists, defers to the existing topic settings, else
-uses the settings as defined by the `Topic` (and its defaults) as generated
-by the `TopicManager`.
+**Returns**:
-In general, users should NOT need this; an Application knows when/how to
-generate changelog topics. To turn off changelogs, init an Application with
-"use_changelog_topics"=`False`.
+dict with headers
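+
+For example, a minimal sketch of a custom serializer that attaches extra headers;
+the class name, header key, and payload format are illustrative only:
+
+```python
+import json
+from typing import Any
+
+from quixstreams.models.serializers.base import SerializationContext, Serializer
+
+
+class VersionedJSONSerializer(Serializer):
+    """Hypothetical serializer that adds a schema-version header to every message."""
+
+    @property
+    def extra_headers(self):
+        # The producer attaches these headers to each message it serializes
+        return {"schema_version": "1"}
+
+    def __call__(self, value: Any, ctx: SerializationContext) -> bytes:
+        return json.dumps(value).encode("utf-8")
+```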
-**Arguments**:
+
-- `consumer_group`: name of consumer group (for this app)
-- `topic_name`: name of consumed topic (app input topic)
-> NOTE: normally contain any prefixes added by TopicManager.topic()
-- `store_name`: name of the store this changelog belongs to
-(default, rolling10s, etc.)
+## quixstreams.models.messages
-**Returns**:
+
-`Topic` object (which is also stored on the TopicManager)
+## quixstreams.models.rows
-
+
-#### TopicManager.create\_topics
+### Row
```python
-def create_topics(topics: List[Topic])
+class Row()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L262)
-
-Creates topics via an explicit list of provided `Topics`.
-
-Exists as a way to manually specify what topics to create; otherwise,
-`create_all_topics()` is generally simpler.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/rows.py#L11)
-**Arguments**:
-
-- `topics`: list of `Topic`s
+Row is a dict-like interface on top of the message data + some Kafka props
-
+
-#### TopicManager.create\_all\_topics
+#### Row.keys
```python
-def create_all_topics()
+def keys() -> KeysView
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L277)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/rows.py#L73)
-A convenience method to create all Topic objects stored on this TopicManager.
+Also allows unpacking row.value via **row
-
+
-#### TopicManager.validate\_all\_topics
+#### Row.clone
```python
-def validate_all_topics()
+def clone(value: dict) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L283)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/rows.py#L85)
-Validates all topics exist and changelogs have correct topic and rep factor.
+Manually clone the Row; doing it this way is much faster than doing a deepcopy
+on the entire Row object.
-Issues are pooled and raised as an Exception once inspections are complete.
+
+
+## quixstreams.models.topics
@@ -3068,7 +3207,7 @@ Issues are pooled and raised as an Exception once inspections are complete.
def convert_topic_list(topics: List[Topic]) -> List[ConfluentTopic]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L23)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L23)
Converts `Topic`s to `ConfluentTopic`s as required for Confluent's
@@ -3090,7 +3229,7 @@ list of confluent_kafka `ConfluentTopic`s
class TopicAdmin()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L46)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L46)
For performing "admin"-level operations on a Kafka cluster, mostly around topics.
@@ -3104,7 +3243,7 @@ Primarily used to create and inspect topic configurations.
def __init__(broker_address: str, extra_config: Optional[Mapping] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L53)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L53)
**Arguments**:
@@ -3119,7 +3258,7 @@ def __init__(broker_address: str, extra_config: Optional[Mapping] = None)
def list_topics() -> Dict[str, ConfluentTopicMetadata]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L74)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L74)
Get a list of topics and their metadata from a Kafka cluster
@@ -3135,7 +3274,7 @@ a dict of topic names and their metadata objects
def inspect_topics(topic_names: List[str]) -> Dict[str, Optional[TopicConfig]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L83)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L83)
A simplified way of getting the topic configurations of the provided topics
@@ -3159,7 +3298,7 @@ def create_topics(topics: List[Topic],
finalize_timeout: int = 60)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L156)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L156)
Create the given list of topics and confirm they are ready.
@@ -3172,10 +3311,6 @@ fail (it ignores issues for a topic already existing).
- `timeout`: timeout of the creation broker request
- `finalize_timeout`: the timeout of the topic finalizing ("ready")
-
-
-## quixstreams.models.topics.exceptions
-
## quixstreams.models.topics.topic
@@ -3189,7 +3324,7 @@ fail (it ignores issues for a topic already existing).
class TopicConfig()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L43)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L43)
Represents all kafka-level configuration for a kafka topic.
@@ -3203,7 +3338,7 @@ Generally used by Topic and any topic creation procedures.
class Topic()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L84)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L84)
A definition of a Kafka topic.
@@ -3226,7 +3361,7 @@ def __init__(
timestamp_extractor: Optional[TimestampExtractor] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L93)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L93)
**Arguments**:
@@ -3248,7 +3383,7 @@ milliseconds from a deserialized message.
def name() -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L122)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L122)
Topic name
@@ -3260,7 +3395,7 @@ Topic name
def row_serialize(row: Row, key: Optional[Any] = None) -> KafkaMessage
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L132)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L132)
Serialize Row to a Kafka message structure
@@ -3282,7 +3417,7 @@ def row_deserialize(
message: ConfluentKafkaMessageProto) -> Union[Row, List[Row], None]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L155)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L155)
Deserialize incoming Kafka message to a Row.
@@ -3294,1086 +3429,1016 @@ Deserialize incoming Kafka message to a Row.
Row, list of Rows or None if the message is ignored.
-
-
-## quixstreams.models.messagecontext
-
-
-
-### MessageContext
-
-```python
-class MessageContext()
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/messagecontext.py#L7)
-
-An object with Kafka message properties.
-
-It is made pseudo-immutable (i.e. public attributes don't have setters), and
-it should not be mutated during message processing.
-
-
-
-## quixstreams.models.messages
-
-
-
-## quixstreams.models.rows
-
-
-
-### Row
-
-```python
-class Row()
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/rows.py#L11)
-
-Row is a dict-like interface on top of the message data + some Kafka props
-
-
-
-#### Row.keys
-
-```python
-def keys() -> KeysView
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/rows.py#L73)
-
-Also allows unpacking row.value via **row
-
-
-
-#### Row.clone
-
-```python
-def clone(value: dict) -> Self
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/rows.py#L85)
+
-Manually clone the Row; doing it this way is much faster than doing a deepcopy
-on the entire Row object.
+## quixstreams.models.topics.exceptions
-
+
-## quixstreams.models.timestamps
+## quixstreams.models.topics.manager
-
+
-### TimestampType
+#### affirm\_ready\_for\_create
```python
-class TimestampType(enum.IntEnum)
+def affirm_ready_for_create(topics: List[Topic])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/timestamps.py#L8)
-
-
-
-#### TIMESTAMP\_NOT\_AVAILABLE
-
-timestamps not supported by broker
-
-
-
-#### TIMESTAMP\_CREATE\_TIME
-
-message creation time (or source / producer time)
-
-
-
-#### TIMESTAMP\_LOG\_APPEND\_TIME
-
-broker receive time
-
-
-
-### MessageTimestamp
-
-```python
-class MessageTimestamp()
-```
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L19)
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/timestamps.py#L14)
+Validate a list of topics is ready for creation attempt
-Represents a timestamp of incoming Kafka message.
+**Arguments**:
-It is made pseudo-immutable (i.e. public attributes don't have setters), and
-it should not be mutated during message processing.
+- `topics`: list of `Topic`s
-
+
-#### MessageTimestamp.create
+### TopicManager
```python
-@classmethod
-def create(cls, timestamp_type: int, milliseconds: int) -> Self
+class TopicManager()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/timestamps.py#L41)
-
-Create a Timestamp object based on data
-
-from `confluent_kafka.Message.timestamp()`.
-
-If timestamp type is "TIMESTAMP_NOT_AVAILABLE", the milliseconds are set to None
-
-**Arguments**:
-
-- `timestamp_type`: a timestamp type represented as a number
-Can be one of:
-- "0" - TIMESTAMP_NOT_AVAILABLE, timestamps not supported by broker.
-- "1" - TIMESTAMP_CREATE_TIME, message creation time (or source / producer time).
-- "2" - TIMESTAMP_LOG_APPEND_TIME, broker receive time.
-- `milliseconds`: the number of milliseconds since the epoch (UTC).
-
-**Returns**:
-
-Timestamp object
-
-
-
-## quixstreams.models.types
-
-
-
-### ConfluentKafkaMessageProto
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L29)
-```python
-class ConfluentKafkaMessageProto(Protocol)
-```
+The source of all topic management with quixstreams.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/types.py#L12)
+Generally initialized and managed automatically by an `Application`,
+but allows a user to work with it directly when needed, such as using it alongside
+a plain `Producer` to create its topics.
-An interface of `confluent_kafka.Message`.
+See methods for details.
-Use it to not depend on exact implementation and simplify testing.
+
-Instances of `confluent_kafka.Message` cannot be directly created from Python,
-see https://github.com/confluentinc/confluent-kafka-python/issues/1535.
+#### TopicManager.\_\_init\_\_
-
+```python
+def __init__(topic_admin: TopicAdmin, create_timeout: int = 60)
+```
-## quixstreams.platforms
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L48)
-
+**Arguments**:
-## quixstreams.platforms.quix.checks
+- `topic_admin`: an `Admin` instance (required for some functionality)
+- `create_timeout`: timeout for topic creation
-
+
-#### check\_state\_management\_enabled
+#### TopicManager.changelog\_topics
```python
-def check_state_management_enabled()
+@property
+def changelog_topics() -> Dict[str, Dict[str, Topic]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/checks.py#L11)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L71)
-Check if State Management feature is enabled for the current deployment on
-Quix platform.
-If it's disabled, the exception will be raised.
+Note: `Topic`s are the changelogs.
-
+returns: the changelog topic dict, {topic_name: {suffix: Topic}}
-#### check\_state\_dir
+
+
+#### TopicManager.topic\_config
```python
-def check_state_dir(state_dir: str)
+def topic_config(num_partitions: Optional[int] = None,
+ replication_factor: Optional[int] = None,
+ extra_config: Optional[dict] = None) -> TopicConfig
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/checks.py#L28)
-
-Check if Application "state_dir" matches the state dir on Quix platform.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L119)
-If it doesn't match, the warning will be logged.
+Convenience method for generating a `TopicConfig` with default settings
**Arguments**:
-- `state_dir`: application state_dir path
+- `num_partitions`: the number of topic partitions
+- `replication_factor`: the topic replication factor
+- `extra_config`: other optional configuration settings
-
+**Returns**:
-## quixstreams.platforms.quix.config
+a TopicConfig object
-
+
-### TopicCreationConfigs
+#### TopicManager.topic
```python
-@dataclasses.dataclass
-class TopicCreationConfigs()
+def topic(name: str,
+ value_deserializer: Optional[DeserializerType] = None,
+ key_deserializer: Optional[DeserializerType] = "bytes",
+ value_serializer: Optional[SerializerType] = None,
+ key_serializer: Optional[SerializerType] = "bytes",
+ config: Optional[TopicConfig] = None,
+ timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L51)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L140)
-
+A convenience method for generating a `Topic`. Will use default config options
-#### name
+as dictated by the TopicManager.
-Required when not created by a Quix App.
+**Arguments**:
-
+- `name`: topic name
+- `value_deserializer`: a deserializer type for values
+- `key_deserializer`: a deserializer type for keys
+- `value_serializer`: a serializer type for values
+- `key_serializer`: a serializer type for keys
+- `config`: optional topic configurations (for creation/validation)
+- `timestamp_extractor`: a callable that returns a timestamp in
+milliseconds from a deserialized message.
-#### strip\_workspace\_id\_prefix
+**Returns**:
+
+Topic object with creation configs
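+
+For example, a minimal sketch of using the manager directly (e.g. alongside a plain
+`Producer`), based on the signatures above; the broker address, topic name, and
+partition settings are placeholders:
+
+```python
+from quixstreams.models.topics.admin import TopicAdmin
+from quixstreams.models.topics.manager import TopicManager
+
+admin = TopicAdmin(broker_address="localhost:9092")  # placeholder broker
+topic_manager = TopicManager(topic_admin=admin)
+
+# Register a topic together with the Kafka-level config to create it with
+events_topic = topic_manager.topic(
+    "events",
+    value_serializer="json",
+    config=topic_manager.topic_config(num_partitions=3, replication_factor=1),
+)
+
+# Create every Topic registered on this manager (only "events" here)
+topic_manager.create_all_topics()
+```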
+
+
+
+#### TopicManager.changelog\_topic
```python
-def strip_workspace_id_prefix(workspace_id: str, s: str) -> str
+def changelog_topic(topic_name: str, store_name: str,
+ consumer_group: str) -> Topic
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L60)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L189)
-Remove the workspace ID from a given string if it starts with it,
+Performs all the logic necessary to generate a changelog topic based on a
-typically a topic or consumer group id
+"source topic" (aka input/consumed topic).
+
+Its main goal is to ensure partition counts of the to-be generated changelog
+match the source topic, and ensure the changelog topic is compacted. Also
+enforces the serialization type. All `Topic` objects generated with this are
+stored on the TopicManager.
+
+If source topic already exists, defers to the existing topic settings, else
+uses the settings as defined by the `Topic` (and its defaults) as generated
+by the `TopicManager`.
+
+In general, users should NOT need this; an Application knows when/how to
+generate changelog topics. To turn off changelogs, init an Application with
+"use_changelog_topics"=`False`.
**Arguments**:
-- `workspace_id`: the workspace id
-- `s`: the string to append to
+- `consumer_group`: name of consumer group (for this app)
+- `topic_name`: name of consumed topic (app input topic)
+> NOTE: normally contains any prefixes added by TopicManager.topic()
+- `store_name`: name of the store this changelog belongs to
+(default, rolling10s, etc.)
**Returns**:
-the string with workspace_id prefix removed
+`Topic` object (which is also stored on the TopicManager)
-
+
-#### prepend\_workspace\_id
+#### TopicManager.create\_topics
```python
-def prepend_workspace_id(workspace_id: str, s: str) -> str
+def create_topics(topics: List[Topic])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L72)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L260)
-Add the workspace ID as a prefix to a given string if it does not have it,
+Creates topics via an explicit list of provided `Topics`.
-typically a topic or consumer group it
+Exists as a way to manually specify what topics to create; otherwise,
+`create_all_topics()` is generally simpler.
**Arguments**:
-- `workspace_id`: the workspace id
-- `s`: the string to append to
+- `topics`: list of `Topic`s
-**Returns**:
+
-the string with workspace_id prepended
+#### TopicManager.create\_all\_topics
-
+```python
+def create_all_topics()
+```
-### QuixKafkaConfigsBuilder
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L275)
+
+A convenience method to create all Topic objects stored on this TopicManager.
+
+
+
+#### TopicManager.validate\_all\_topics
```python
-class QuixKafkaConfigsBuilder()
+def validate_all_topics()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L84)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L281)
-Retrieves all the necessary information from the Quix API and builds all the
-objects required to connect a confluent-kafka client to the Quix Platform.
+Validates that all topics exist and changelogs have the correct partition counts and replication factor.
-If not executed within the Quix platform directly, you must provide a Quix
-"streaming" (aka "sdk") token, or Personal Access Token.
+Issues are pooled and raised as an Exception once inspections are complete.
-Ideally you also know your workspace name or id. If not, you can search for it
-using a known topic name, but note the search space is limited to the access level
-of your token.
+
-It also currently handles the app_auto_create_topics setting for Application.Quix.
+## quixstreams.state.rocksdb.windowed.store
-
+
-#### QuixKafkaConfigsBuilder.\_\_init\_\_
+### WindowedRocksDBStore
```python
-def __init__(quix_sdk_token: Optional[str] = None,
- workspace_id: Optional[str] = None,
- workspace_cert_path: Optional[str] = None,
- quix_portal_api_service: Optional[QuixPortalApiService] = None)
+class WindowedRocksDBStore(RocksDBStore)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L100)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/store.py#L10)
-**Arguments**:
+RocksDB-based windowed state store.
-- `quix_portal_api_service`: A QuixPortalApiService instance (else generated)
-- `workspace_id`: A valid Quix Workspace ID (else searched for)
-- `workspace_cert_path`: path to an existing workspace cert (else retrieved)
+It keeps track of individual store partitions and provides access to the
+partitions' transactions.
-
+
-#### QuixKafkaConfigsBuilder.strip\_workspace\_id\_prefix
+#### WindowedRocksDBStore.\_\_init\_\_
```python
-def strip_workspace_id_prefix(s: str) -> str
+def __init__(
+ name: str,
+ topic: str,
+ base_dir: str,
+ changelog_producer_factory: Optional[ChangelogProducerFactory] = None,
+ options: Optional[RocksDBOptionsType] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L184)
-
-Remove the workspace ID from a given string if it starts with it,
-
-typically a topic or consumer group id
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/store.py#L18)
**Arguments**:
-- `s`: the string to append to
+- `name`: a unique store name
+- `topic`: a topic name for this store
+- `base_dir`: path to a directory with the state
+- `changelog_producer_factory`: a ChangelogProducerFactory instance
+if using changelogs
+- `options`: RocksDB options. If `None`, the default options will be used.
-**Returns**:
+
-the string with workspace_id prefix removed
+## quixstreams.state.rocksdb.windowed.partition
-
+
-#### QuixKafkaConfigsBuilder.prepend\_workspace\_id
+### WindowedRocksDBStorePartition
```python
-def prepend_workspace_id(s: str) -> str
+class WindowedRocksDBStorePartition(RocksDBStorePartition)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L194)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/partition.py#L24)
-Add the workspace ID as a prefix to a given string if it does not have it,
+A base class to access windowed state in RocksDB.
-typically a topic or consumer group it
+It represents a single RocksDB database.
+
+Besides the data, it keeps track of the latest observed timestamp and
+stores the expiration index to delete expired windows.
**Arguments**:
-- `s`: the string to append to
+- `path`: an absolute path to the RocksDB folder
+- `options`: RocksDB options. If `None`, the default options will be used.
-**Returns**:
+
-the string with workspace_id prepended
+## quixstreams.state.rocksdb.windowed.metadata
-
+
-#### QuixKafkaConfigsBuilder.search\_for\_workspace
+## quixstreams.state.rocksdb.windowed.transaction
+
+
+
+### WindowedRocksDBPartitionTransaction
```python
-def search_for_workspace(
- workspace_name_or_id: Optional[str] = None) -> Optional[dict]
+class WindowedRocksDBPartitionTransaction(RocksDBPartitionTransaction)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L204)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/transaction.py#L22)
-Search for a workspace given an expected workspace name or id.
+
-**Arguments**:
+#### WindowedRocksDBPartitionTransaction.expire\_windows
-- `workspace_name_or_id`: the expected name or id of a workspace
+```python
+def expire_windows(duration_ms: int,
+ prefix: bytes,
+ grace_ms: int = 0) -> List[Tuple[Tuple[int, int], Any]]
+```
-**Returns**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/transaction.py#L105)
-the workspace data dict if search success, else None
+Get a list of expired windows from RocksDB considering latest timestamp,
-
+window size and grace period.
+It marks the latest found window as expired in the expiration index, so
+calling this method multiple times will yield different results for the same
+"latest timestamp".
-#### QuixKafkaConfigsBuilder.get\_workspace\_info
+How it works:
+- First, it looks for the start time of the last expired window for the current
+ prefix using expiration cache. If it's found, it will be used to reduce
+ the search space and to avoid returning already expired windows.
+- Then it goes over window segments and fetches the windows
+ that should be expired.
+- Finally, it updates the expiration cache with the start time of the latest
+  found windows.
-```python
-def get_workspace_info(known_workspace_topic: Optional[str] = None)
-```
+**Returns**:
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L227)
+sorted list of tuples in format `((start, end), value)`
-Queries for workspace data from the Quix API, regardless of instance cache,
+
-and updates instance attributes from query result.
+## quixstreams.state.rocksdb.windowed
-**Arguments**:
+
-- `known_workspace_topic`: a topic you know to exist in some workspace
+## quixstreams.state.rocksdb.windowed.serialization
-
+
-#### QuixKafkaConfigsBuilder.search\_workspace\_for\_topic
+#### parse\_window\_key
```python
-def search_workspace_for_topic(workspace_id: str, topic: str) -> Optional[str]
+def parse_window_key(key: bytes) -> Tuple[bytes, int, int]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L254)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/serialization.py#L12)
-Search through all the topics in the given workspace id to see if there is a
+Parse the window key from Rocksdb into (message_key, start, end) structure.
-match with the provided topic.
+Expected window key format:
+`<message_key>|<start>|<end>`
**Arguments**:
-- `workspace_id`: the workspace to search in
-- `topic`: the topic to search for
+- `key`: a key from Rocksdb
**Returns**:
-the workspace_id if success, else None
+a tuple with message key, start timestamp, end timestamp
-
+
-#### QuixKafkaConfigsBuilder.search\_for\_topic\_workspace
+#### encode\_window\_key
```python
-def search_for_topic_workspace(topic: str) -> Optional[dict]
+def encode_window_key(start_ms: int, end_ms: int) -> bytes
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L270)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/serialization.py#L39)
-Find what workspace a topic belongs to.
+Encode window start and end timestamps into bytes of the following format:
-If there is only one workspace altogether, it is assumed to be the workspace.
-More than one means each workspace will be searched until the first hit.
+```<start>|<end>```
+
+Encoding window keys this way makes them sortable in RocksDB within the same prefix.
**Arguments**:
-- `topic`: the topic to search for
+- `start_ms`: window start in milliseconds
+- `end_ms`: window end in milliseconds
**Returns**:
-workspace data dict if topic search success, else None
+window timestamps as bytes
-
+
-#### QuixKafkaConfigsBuilder.get\_workspace\_ssl\_cert
+#### encode\_window\_prefix
```python
-def get_workspace_ssl_cert(
- extract_to_folder: Optional[Path] = None) -> Optional[str]
+def encode_window_prefix(prefix: bytes, start_ms: int) -> bytes
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L291)
-
-Gets and extracts zipped certificate from the API to provided folder if the
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/serialization.py#L53)
-SSL certificate is specified in broker configuration.
+Encode window prefix and start time to iterate over keys in RocksDB
-If no path was provided, will dump to /tmp. Expects cert named 'ca.cert'.
+Format:
+```<prefix>|<start_ms>```
**Arguments**:
-- `extract_to_folder`: path to folder to dump zipped cert file to
+- `prefix`: transaction prefix
+- `start_ms`: window start time in milliseconds
**Returns**:
-full cert filepath as string or `None` if certificate is not specified
-
-
+bytes
-#### QuixKafkaConfigsBuilder.create\_topics
+
-```python
-def create_topics(topics: List[Topic],
- finalize_timeout_seconds: Optional[int] = None)
-```
+## quixstreams.state.rocksdb.windowed.state
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L368)
+
-Create topics in a Quix cluster.
+### WindowedTransactionState
-**Arguments**:
+```python
+class WindowedTransactionState(WindowedState)
+```
-- `topics`: a list of `Topic` objects
-- `finalize_timeout_seconds`: How long to wait for the topics to be
-marked as "Ready" (and thus ready to produce to/consume from).
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L9)
-
+
-#### QuixKafkaConfigsBuilder.confirm\_topics\_exist
+#### WindowedTransactionState.\_\_init\_\_
```python
-def confirm_topics_exist(topics: Union[List[Topic], List[str]])
+def __init__(transaction: "WindowedRocksDBPartitionTransaction",
+ prefix: bytes)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L417)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L12)
-Confirm whether the desired set of topics exists in the Quix workspace.
+A windowed state to be provided into `StreamingDataFrame` window functions.
**Arguments**:
-- `topics`: a list of `Topic` or topic names
+- `transaction`: instance of `WindowedRocksDBPartitionTransaction`
-
+
-#### QuixKafkaConfigsBuilder.get\_confluent\_broker\_config
+#### WindowedTransactionState.get\_window
```python
-def get_confluent_broker_config(known_topic: Optional[str] = None) -> dict
+def get_window(start_ms: int,
+ end_ms: int,
+ default: Any = None) -> Optional[Any]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L457)
-
-Get the full client config dictionary required to authenticate a confluent-kafka
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L23)
-client to a Quix platform broker/workspace.
+Get the value of the window defined by `start` and `end` timestamps
-The returned config can be used directly by any confluent-kafka-python consumer/
-producer (add your producer/consumer-specific configs afterward).
+if the window is present in the state, else default
**Arguments**:
-- `known_topic`: a topic known to exist in some workspace
+- `start_ms`: start of the window in milliseconds
+- `end_ms`: end of the window in milliseconds
+- `default`: default value to return if the key is not found
**Returns**:
-a dict of confluent-kafka-python client settings (see librdkafka
-config for more details)
+value or None if the key is not found and `default` is not provided
-
+
-#### QuixKafkaConfigsBuilder.get\_confluent\_client\_configs
+#### WindowedTransactionState.update\_window
```python
-def get_confluent_client_configs(
- topics: list,
- consumer_group_id: Optional[str] = None
-) -> Tuple[dict, List[str], Optional[str]]
+def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L502)
-
-Get all the values you need in order to use a confluent_kafka-based client
-
-with a topic on a Quix platform broker/workspace.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L39)
-The returned config can be used directly by any confluent-kafka-python consumer/
-producer (add your producer/consumer-specific configs afterward).
+Set a value for the window.
-The topics and consumer group are appended with any necessary values.
+This method will also update the latest observed timestamp in the state partition
+using the provided `timestamp_ms`.
**Arguments**:
-- `topics`: list of topics
-- `consumer_group_id`: consumer group id, if needed
-
-**Returns**:
+- `start_ms`: start of the window in milliseconds
+- `end_ms`: end of the window in milliseconds
+- `value`: value of the window
+- `timestamp_ms`: current message timestamp in milliseconds
-a tuple with configs and altered versions of the topics
-and consumer group name
+
-
+#### WindowedTransactionState.get\_latest\_timestamp
-## quixstreams.platforms.quix.env
+```python
+def get_latest_timestamp() -> int
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L60)
-### QuixEnvironment
+Get the latest observed timestamp for the current state partition.
-```python
-class QuixEnvironment()
-```
+Use this timestamp to determine if the arriving event is late and should be
+discarded from processing.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L7)
+**Returns**:
-Class to access various Quix platform environment settings
+latest observed event timestamp in milliseconds
-
+
-#### QuixEnvironment.state\_management\_enabled
+#### WindowedTransactionState.expire\_windows
```python
-@property
-def state_management_enabled() -> bool
+def expire_windows(duration_ms: int,
+ grace_ms: int = 0) -> List[Tuple[Tuple[int, int], Any]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L19)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L72)
-Check whether "State management" is enabled for the current deployment
+Get a list of expired windows from RocksDB considering the current
+latest timestamp, window duration and grace period.
-**Returns**:
+It also marks the latest found window as expired in the expiration index, so
+calling this method multiple times will yield different results for the same
+"latest timestamp".
-True if state management is enabled, otherwise False
+
-
+## quixstreams.state.rocksdb.options
-#### QuixEnvironment.deployment\_id
+
+
+### RocksDBOptions
```python
-@property
-def deployment_id() -> Optional[str]
+@dataclasses.dataclass(frozen=True)
+class RocksDBOptions(RocksDBOptionsType)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L27)
-
-Return current Quix deployment id.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/options.py#L25)
-This variable is meant to be set only by Quix Platform and only
-when the application is deployed.
+RocksDB database options.
-**Returns**:
+**Arguments**:
-deployment id or None
+- `dumps`: function to dump data to JSON
+- `loads`: function to load data from JSON
+- `open_max_retries`: number of times to retry opening the database
+if it's locked by another process. To disable retrying, pass 0
+- `open_retry_backoff`: number of seconds to wait between each retry.
+Please see `rocksdict.Options` for a complete description of other options.
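+
+For example, a minimal sketch of building custom options and passing them to a store;
+the store name, topic, and directory are placeholders:
+
+```python
+from quixstreams.state.rocksdb.options import RocksDBOptions
+from quixstreams.state.rocksdb.store import RocksDBStore
+
+# Retry opening a locked database up to 5 times, waiting 1 second between attempts
+options = RocksDBOptions(open_max_retries=5, open_retry_backoff=1.0)
+
+store = RocksDBStore(
+    name="my_store",
+    topic="my_topic",
+    base_dir="./state",
+    options=options,
+)
+```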
-
+
-#### QuixEnvironment.workspace\_id
+#### RocksDBOptions.to\_options
```python
-@property
-def workspace_id() -> Optional[str]
+def to_options() -> rocksdict.Options
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L39)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/options.py#L53)
-Return Quix workspace id if set
+Convert parameters to `rocksdict.Options`
**Returns**:
-workspace id or None
+instance of `rocksdict.Options`
-
+
-#### QuixEnvironment.portal\_api
+## quixstreams.state.rocksdb.store
+
+
+
+### RocksDBStore
```python
-@property
-def portal_api() -> Optional[str]
+class RocksDBStore(Store)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L47)
-
-Return Quix Portal API url if set
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L19)
-**Returns**:
+RocksDB-based state store.
-portal API URL or None
+It keeps track of individual store partitions and provides access to the
+partitions' transactions.
-
+
-#### QuixEnvironment.state\_dir
+#### RocksDBStore.\_\_init\_\_
```python
-@property
-def state_dir() -> str
+def __init__(
+ name: str,
+ topic: str,
+ base_dir: str,
+ changelog_producer_factory: Optional[ChangelogProducerFactory] = None,
+ options: Optional[options_type] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L56)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L29)
-Return application state directory on Quix.
+**Arguments**:
-**Returns**:
+- `name`: a unique store name
+- `topic`: a topic name for this store
+- `base_dir`: path to a directory with the state
+- `changelog_producer_factory`: a ChangelogProducerFactory instance
+if using changelogs
+- `options`: RocksDB options. If `None`, the default options will be used.
-path to state dir
+
+
+#### RocksDBStore.topic
+
+```python
+@property
+def topic() -> str
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L53)
-## quixstreams.platforms.quix.topic\_manager
+Store topic name
-
+
-### QuixTopicManager
+#### RocksDBStore.name
```python
-class QuixTopicManager(TopicManager)
+@property
+def name() -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/topic_manager.py#L9)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L60)
-The source of all topic management with quixstreams.
+Store name
-This is specifically for Applications using the Quix platform.
+
-Generally initialized and managed automatically by an `Application.Quix`,
-but allows a user to work with it directly when needed, such as using it alongside
-a plain `Producer` to create its topics.
+#### RocksDBStore.partitions
-See methods for details.
+```python
+@property
+def partitions() -> Dict[int, RocksDBStorePartition]
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L67)
-#### QuixTopicManager.\_\_init\_\_
+Mapping of assigned store partitions
+
+
+
+#### RocksDBStore.assign\_partition
```python
-def __init__(topic_admin: TopicAdmin,
- quix_config_builder: QuixKafkaConfigsBuilder,
- create_timeout: int = 60)
+def assign_partition(partition: int) -> RocksDBStorePartition
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/topic_manager.py#L30)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L80)
-**Arguments**:
+Open and assign store partition.
-- `topic_admin`: an `Admin` instance
-- `create_timeout`: timeout for topic creation
-- `quix_config_builder`: A QuixKafkaConfigsBuilder instance, else one is
-generated for you.
+If the partition is already assigned, it will not re-open it and return
+the existing partition instead.
-
+**Arguments**:
-## quixstreams.platforms.quix
+- `partition`: partition number
-
+**Returns**:
-## quixstreams.platforms.quix.api
+instance of`RocksDBStorePartition`
-
+
-### QuixPortalApiService
+#### RocksDBStore.revoke\_partition
```python
-class QuixPortalApiService()
+def revoke_partition(partition: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/api.py#L19)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L117)
-A light wrapper around the Quix Portal Api. If used in the Quix Platform, it will
-use that workspaces auth token and portal endpoint, else you must provide it.
+Revoke and close the assigned store partition.
-Function names closely reflect the respective API endpoint,
-each starting with the method [GET, POST, etc.] followed by the endpoint path.
+If the partition is not assigned, it will log a message and return.
-Results will be returned in the form of request's Response.json(), unless something
-else is required. Non-200's will raise exceptions.
+**Arguments**:
-See the swagger documentation for more info about the endpoints.
+- `partition`: partition number
-
+
-#### QuixPortalApiService.get\_workspace\_certificate
+#### RocksDBStore.start\_partition\_transaction
```python
-def get_workspace_certificate(
- workspace_id: Optional[str] = None) -> Optional[bytes]
+def start_partition_transaction(partition: int) -> RocksDBPartitionTransaction
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/api.py#L114)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L138)
-Get a workspace TLS certificate if available.
+Start a new partition transaction.
-Returns `None` if certificate is not specified.
+`RocksDBPartitionTransaction` is the primary interface for working with data in
+the underlying RocksDB.
**Arguments**:
-- `workspace_id`: workspace id, optional
+- `partition`: partition number
**Returns**:
-certificate as bytes if present, or None
-
-
+instance of `RocksDBPartitionTransaction`
-## quixstreams.platforms.quix.exceptions
+
-
+#### RocksDBStore.close
-## quixstreams.state.rocksdb.serialization
+```python
+def close()
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L160)
-## quixstreams.state.rocksdb.windowed
+Close the store and revoke all assigned partitions
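+
+For example, a minimal sketch of the store lifecycle based on the methods above; the
+names are placeholders and the transaction itself is left unused:
+
+```python
+from quixstreams.state.rocksdb.store import RocksDBStore
+
+store = RocksDBStore(name="my_store", topic="my_topic", base_dir="./state")
+
+# Open the RocksDB database backing partition 0
+store.assign_partition(0)
+
+# Transactions are the primary interface for working with the data
+tx = store.start_partition_transaction(0)
+# ... read/write state via `tx` (see RocksDBPartitionTransaction) ...
+
+# Close the store and revoke all assigned partitions when done
+store.close()
+```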
-
+
-## quixstreams.state.rocksdb.windowed.serialization
+## quixstreams.state.rocksdb.partition
-
+
-#### parse\_window\_key
+### RocksDBStorePartition
```python
-def parse_window_key(key: bytes) -> Tuple[bytes, int, int]
+class RocksDBStorePartition(StorePartition)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/serialization.py#L12)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L40)
-Parse the window key from Rocksdb into (message_key, start, end) structure.
+A base class to access state in RocksDB.
-Expected window key format:
-||
+It represents a single RocksDB database.
-**Arguments**:
+Responsibilities:
+ 1. Managing access to the RocksDB instance
+ 2. Creating transactions to interact with data
+ 3. Flushing WriteBatches to the RocksDB
-- `key`: a key from Rocksdb
+It opens the RocksDB on `__init__`. If the db is locked by another process,
+it will retry according to `open_max_retries` and `open_retry_backoff` options.
-**Returns**:
+**Arguments**:
-a tuple with message key, start timestamp, end timestamp
+- `path`: an absolute path to the RocksDB folder
+- `options`: RocksDB options. If `None`, the default options will be used.
-
+
-#### encode\_window\_key
+#### RocksDBStorePartition.begin
```python
-def encode_window_key(start_ms: int, end_ms: int) -> bytes
+def begin() -> RocksDBPartitionTransaction
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/serialization.py#L39)
-
-Encode window start and end timestamps into bytes of the following format:
-
-```|```
-
-Encoding window keys this way make them sortable in RocksDB within the same prefix.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L75)
-**Arguments**:
+Create a new `RocksDBPartitionTransaction` object.
-- `start_ms`: window start in milliseconds
-- `end_ms`: window end in milliseconds
+Using `RocksDBPartitionTransaction` is the recommended way of accessing the data.
**Returns**:
-window timestamps as bytes
+an instance of `RocksDBPartitionTransaction`
-
+
-#### encode\_window\_prefix
+#### RocksDBStorePartition.recover\_from\_changelog\_message
```python
-def encode_window_prefix(prefix: bytes, start_ms: int) -> bytes
+def recover_from_changelog_message(
+ changelog_message: ConfluentKafkaMessageProto, committed_offset: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/serialization.py#L53)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L128)
-Encode window prefix and start time to iterate over keys in RocksDB
+Updates state from a given changelog message.
-Format:
-```|```
+The actual update may be skipped when both conditions are met:
+
+- The changelog message has headers with the processed message offset.
+- This processed offset is larger than the latest committed offset for the same
+ topic partition.
+
+This way the state does not apply the state changes for not-yet-committed
+messages and improves the state consistency guarantees.
**Arguments**:
-- `prefix`: transaction prefix
-- `start_ms`: window start time in milliseconds
+- `changelog_message`: A raw Confluent message read from a changelog topic.
+- `committed_offset`: latest committed offset for the partition
-**Returns**:
+
-bytes
+#### RocksDBStorePartition.set\_changelog\_offset
-
+```python
+def set_changelog_offset(changelog_offset: int)
+```
-## quixstreams.state.rocksdb.windowed.state
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L169)
-
+Set the changelog offset based on a message (usually an "offset-only" message).
-### WindowedTransactionState
+Used during recovery.
-```python
-class WindowedTransactionState(WindowedState)
-```
+**Arguments**:
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L9)
+- `changelog_offset`: A changelog offset
-
+
-#### WindowedTransactionState.\_\_init\_\_
+#### RocksDBStorePartition.write
```python
-def __init__(transaction: "WindowedRocksDBPartitionTransaction")
+def write(batch: WriteBatch)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L12)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L179)
-A windowed state to be provided into `StreamingDataFrame` window functions.
+Write `WriteBatch` to RocksDB
**Arguments**:
-- `transaction`: instance of `WindowedRocksDBPartitionTransaction`
+- `batch`: an instance of `rocksdict.WriteBatch`
-
+
-#### WindowedTransactionState.get\_window
+#### RocksDBStorePartition.get
```python
-def get_window(start_ms: int,
- end_ms: int,
- default: Any = None) -> Optional[Any]
+def get(key: bytes,
+ default: Any = None,
+ cf_name: str = "default") -> Union[None, bytes, Any]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L20)
-
-Get the value of the window defined by `start` and `end` timestamps
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L186)
-if the window is present in the state, else default
+Get a key from RocksDB.
**Arguments**:
-- `start_ms`: start of the window in milliseconds
-- `end_ms`: end of the window in milliseconds
-- `default`: default value to return if the key is not found
+- `key`: a key encoded to `bytes`
+- `default`: a default value to return if the key is not found.
+- `cf_name`: rocksdb column family name. Default - "default"
**Returns**:
-value or None if the key is not found and `default` is not provided
+a value if the key is present in the DB. Otherwise, `default`
-
+
-#### WindowedTransactionState.update\_window
+#### RocksDBStorePartition.exists
```python
-def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int)
+def exists(key: bytes, cf_name: str = "default") -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L36)
-
-Set a value for the window.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L200)
-This method will also update the latest observed timestamp in state partition
-using the provided `timestamp`.
+Check if a key is present in the DB.
**Arguments**:
-- `start_ms`: start of the window in milliseconds
-- `end_ms`: end of the window in milliseconds
-- `value`: value of the window
-- `timestamp_ms`: current message timestamp in milliseconds
-
-
-
-#### WindowedTransactionState.get\_latest\_timestamp
-
-```python
-def get_latest_timestamp() -> int
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L53)
-
-Get the latest observed timestamp for the current state partition.
-
-Use this timestamp to determine if the arriving event is late and should be
-discarded from the processing.
+- `key`: a key encoded to `bytes`.
+- `cf_name`: rocksdb column family name. Default - "default"
**Returns**:
-latest observed event timestamp in milliseconds
+`True` if the key is present, `False` otherwise.
-
+
-#### WindowedTransactionState.expire\_windows
+#### RocksDBStorePartition.get\_processed\_offset
```python
-def expire_windows(duration_ms: int,
- grace_ms: int = 0) -> List[Tuple[Tuple[int, int], Any]]
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L65)
-
-Get a list of expired windows from RocksDB considering the current
-latest timestamp, window duration and grace period.
-
-It also marks the latest found window as expired in the expiration index, so
-calling this method multiple times will yield different results for the same
-"latest timestamp".
+def get_processed_offset() -> Optional[int]
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L211)
-## quixstreams.state.rocksdb.windowed.metadata
+Get last processed offset for the given partition
-
+**Returns**:
-## quixstreams.state.rocksdb.windowed.partition
+offset or `None` if there's no processed offset yet
-
+
-### WindowedRocksDBStorePartition
+#### RocksDBStorePartition.get\_changelog\_offset
```python
-class WindowedRocksDBStorePartition(RocksDBStorePartition)
+def get_changelog_offset() -> Optional[int]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/partition.py#L24)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L223)
-A base class to access windowed state in RocksDB.
+Get offset that the changelog is up-to-date with.
-It represents a single RocksDB database.
+**Returns**:
-Besides the data, it keeps track of the latest observed timestamp and
-stores the expiration index to delete expired windows.
+offset or `None` if there's no processed offset yet
-**Arguments**:
+
-- `path`: an absolute path to the RocksDB folder
-- `options`: RocksDB options. If `None`, the default options will be used.
+#### RocksDBStorePartition.close
-
+```python
+def close()
+```
-## quixstreams.state.rocksdb.windowed.store
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L233)
-
+Close the underlying RocksDB
-### WindowedRocksDBStore
+
+
+#### RocksDBStorePartition.path
```python
-class WindowedRocksDBStore(RocksDBStore)
+@property
+def path() -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/store.py#L10)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L246)
-RocksDB-based windowed state store.
+Absolute path to RocksDB database folder
-It keeps track of individual store partitions and provides access to the
-partitions' transactions.
+**Returns**:
-
+file path
-#### WindowedRocksDBStore.\_\_init\_\_
+
+
+#### RocksDBStorePartition.destroy
```python
-def __init__(
- name: str,
- topic: str,
- base_dir: str,
- changelog_producer_factory: Optional[ChangelogProducerFactory] = None,
- options: Optional[RocksDBOptionsType] = None)
+@classmethod
+def destroy(cls, path: str)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/store.py#L18)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L254)
-**Arguments**:
+Delete underlying RocksDB database
-- `name`: a unique store name
-- `topic`: a topic name for this store
-- `base_dir`: path to a directory with the state
-- `changelog_producer_factory`: a ChangelogProducerFactory instance
-if using changelogs
-- `options`: RocksDB options. If `None`, the default options will be used.
+The database must be closed first.
-
+**Arguments**:
-## quixstreams.state.rocksdb.windowed.transaction
+- `path`: an absolute path to the RocksDB folder
-
+
-### WindowedRocksDBPartitionTransaction
+#### RocksDBStorePartition.get\_column\_family\_handle
```python
-class WindowedRocksDBPartitionTransaction(RocksDBPartitionTransaction)
+def get_column_family_handle(cf_name: str) -> ColumnFamily
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/transaction.py#L21)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L264)
-
+Get a column family handle to pass to a WriteBatch.
-#### WindowedRocksDBPartitionTransaction.expire\_windows
+This method will cache the CF handle instance to avoid creating them
+repeatedly.
+
+**Arguments**:
+
+- `cf_name`: column family name
+
+**Returns**:
+
+instance of `rocksdict.ColumnFamily`
+
+
+
+#### RocksDBStorePartition.get\_column\_family
```python
-def expire_windows(duration_ms: int,
- grace_ms: int = 0) -> List[Tuple[Tuple[int, int], Any]]
+def get_column_family(cf_name: str) -> Rdict
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/transaction.py#L79)
-
-Get a list of expired windows from RocksDB considering latest timestamp,
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L285)
-window size and grace period.
-It marks the latest found window as expired in the expiration index, so
-calling this method multiple times will yield different results for the same
-"latest timestamp".
+Get a column family instance.
-How it works:
-- First, it looks for the start time of the last expired window for the current
- prefix using expiration cache. If it's found, it will be used to reduce
- the search space and to avoid returning already expired windows.
-- Then it goes over window segments and fetches the windows
- that should be expired.
-- At last, it updates the expiration cache with the start time of the latest
- found windows
+This method will cache the CF instance to avoid creating them repeatedly.
-**Returns**:
+**Arguments**:
-sorted list of tuples in format `((start, end), value)`
+- `cf_name`: column family name
-
+**Returns**:
-## quixstreams.state.rocksdb
+instance of `rocksdict.Rdict` for the given column family
@@ -4391,7 +4456,7 @@ sorted list of tuples in format `((start, end), value)`
class RocksDBPartitionTransaction(PartitionTransaction)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L71)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L61)
A transaction class to perform simple key-value operations like
"get", "set", "delete" and "exists" on a single RocksDB partition.
@@ -4402,10 +4467,9 @@ Serialization
Prefixing
*********
-`RocksDBTransaction` allows to set prefixes for the keys in the given code block
-using :meth:`with_prefix()` context manager.
-Normally, `StreamingDataFrame` class will use message keys as prefixes
-in order to namespace the stored keys across different messages.
+The `get()`, `set()`, `delete()` and `exists()` methods require prefixes for
+the keys.
+Normally, the Kafka message keys are supposed to be used as prefixes.
Transactional properties
************************
@@ -4415,7 +4479,7 @@ in a single batch, flush them atomically, and allow the updates be visible
within the transaction before it's flushed (aka "read-your-own-writes" problem).
If any mutation fails during the transaction
-(e.g. we failed to write the updates to the RocksDB), the whole transaction
+(e.g., it failed to write the updates to RocksDB), the whole transaction
will be marked as failed and cannot be used anymore.
In this case, a new `RocksDBTransaction` should be created.
@@ -4426,11 +4490,13 @@ In this case, a new `RocksDBTransaction` should be created.
#### RocksDBPartitionTransaction.\_\_init\_\_
```python
-def __init__(partition: "RocksDBStorePartition", dumps: DumpsFunc,
- loads: LoadsFunc)
+def __init__(partition: "RocksDBStorePartition",
+ dumps: DumpsFunc,
+ loads: LoadsFunc,
+ changelog_producer: Optional[ChangelogProducer] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L114)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L100)
**Arguments**:
@@ -4439,46 +4505,19 @@ the underlying RocksDB
- `dumps`: a function to serialize data to bytes.
- `loads`: a function to deserialize data from bytes.
-
-
-#### RocksDBPartitionTransaction.with\_prefix
-
-```python
-@contextlib.contextmanager
-def with_prefix(prefix: Any = b"") -> Self
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L141)
-
-A context manager set the prefix for all keys in the scope.
-
-Normally, it's called by Streaming DataFrames engine to ensure that every
-message key is stored separately.
-
-The `with_prefix` calls should not be nested.
-Only one prefix can be set at a time.
-
-**Arguments**:
-
-- `prefix`: a prefix string to be used.
-Should be either `bytes` or object serializable to `bytes`
-by `dumps` function.
-The prefix doesn't need to contain the separator, it will be added
-automatically between the key and the prefix if the prefix
-is not empty.
-
#### RocksDBPartitionTransaction.get
```python
-@_validate_transaction_state
+@_validate_transaction_status(PartitionTransactionStatus.STARTED)
def get(key: Any,
+ prefix: bytes,
default: Any = None,
cf_name: str = "default") -> Optional[Any]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L170)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L124)
Get a key from the store.
@@ -4490,6 +4529,7 @@ It returns `None` if the key is not found and `default` is not provided.
**Arguments**:
- `key`: a key to get from DB
+- `prefix`: a key prefix
- `default`: value to return if the key is not present in the state.
It can be of any type.
- `cf_name`: rocksdb column family name. Default - "default"
@@ -4503,11 +4543,11 @@ value or `default`
#### RocksDBPartitionTransaction.set
```python
-@_validate_transaction_state
-def set(key: Any, value: Any, cf_name: str = "default")
+@_validate_transaction_status(PartitionTransactionStatus.STARTED)
+def set(key: Any, value: Any, prefix: bytes, cf_name: str = "default")
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L205)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L164)
Set a key to the store.
@@ -4516,6 +4556,7 @@ It first updates the key in the update cache.
**Arguments**:
- `key`: key to store in DB
+- `prefix`: a key prefix
- `value`: value to store in DB
- `cf_name`: rocksdb column family name. Default - "default"
@@ -4524,11 +4565,11 @@ It first updates the key in the update cache.
#### RocksDBPartitionTransaction.delete
```python
-@_validate_transaction_state
-def delete(key: Any, cf_name: str = "default")
+@_validate_transaction_status(PartitionTransactionStatus.STARTED)
+def delete(key: Any, prefix: bytes, cf_name: str = "default")
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L230)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L187)
Delete a key from the store.
@@ -4536,7 +4577,8 @@ It first deletes the key from the update cache.
**Arguments**:
-- `key`: key to delete from DB
+- `key`: a key to delete from DB
+- `prefix`: a key prefix
- `cf_name`: rocksdb column family name. Default - "default"
@@ -4544,11 +4586,11 @@ It first deletes the key from the update cache.
#### RocksDBPartitionTransaction.exists
```python
-@_validate_transaction_state
-def exists(key: Any, cf_name: str = "default") -> bool
+@_validate_transaction_status(PartitionTransactionStatus.STARTED)
+def exists(key: Any, prefix: bytes, cf_name: str = "default") -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L253)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L208)
Check if a key exists in the store.
@@ -4557,12 +4599,70 @@ It first looks up the key in the update cache.
**Arguments**:
- `key`: a key to check in DB
+- `prefix`: a key prefix
- `cf_name`: rocksdb column family name. Default - "default"
**Returns**:
`True` if the key exists, `False` otherwise.
+
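+A hedged usage sketch (assuming `tx` is a started transaction obtained from a
+`RocksDBStorePartition`; keys and the prefix below are placeholders):
+
+```python
+prefix = b"message-key"  # normally the Kafka message key
+
+tx.set(key="total", value=10, prefix=prefix)
+tx.get("total", prefix=prefix)             # -> 10 (read-your-own-writes)
+tx.exists("total", prefix=prefix)          # -> True
+tx.delete("total", prefix=prefix)
+tx.get("total", prefix=prefix, default=0)  # -> 0
+```
+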
+
+#### RocksDBPartitionTransaction.prepare
+
+```python
+@_validate_transaction_status(PartitionTransactionStatus.STARTED)
+def prepare(processed_offset: int)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L235)
+
+Produce changelog messages to the changelog topic for all changes accumulated
+
+in this transaction and prepare the transaction to flush its state to the state
+store.
+
+After successful `prepare()`, the transaction status is changed to PREPARED,
+and it cannot receive updates anymore.
+
+If changelog is disabled for this application, no updates will be produced
+to the changelog topic.
+
+**Arguments**:
+
+- `processed_offset`: the offset of the latest processed message
+
+
+
+#### RocksDBPartitionTransaction.flush
+
+```python
+@_validate_transaction_status(PartitionTransactionStatus.STARTED,
+ PartitionTransactionStatus.PREPARED)
+def flush(processed_offset: Optional[int] = None,
+ changelog_offset: Optional[int] = None)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L259)
+
+Flush the recent updates to the database.
+
+It writes the WriteBatch to RocksDB and marks itself as finished.
+
+If writing fails, the transaction is marked as failed and
+cannot be used anymore.
+
+>***NOTE:*** If no keys have been modified during the transaction
+ (i.e., neither "set" nor "delete" has been called), it will
+ not flush ANY data to the database, including the offset, in order
+ to optimize I/O.
+
+**Arguments**:
+
+- `processed_offset`: offset of the last processed message, optional.
+- `changelog_offset`: offset of the last produced changelog message,
+optional.
+
#### RocksDBPartitionTransaction.completed
@@ -4572,7 +4672,7 @@ It first looks up the key in the update cache.
def completed() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L275)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L294)
Check if the transaction is completed.
@@ -4585,6 +4685,26 @@ The completed transaction should not be re-used.
`True` if transaction is completed, `False` otherwise.
+
+
+#### RocksDBPartitionTransaction.prepared
+
+```python
+@property
+def prepared() -> bool
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L308)
+
+Check if the transaction is in PREPARED status.
+
+A prepared transaction has successfully flushed its changelog and cannot receive
+updates anymore, but its state is not yet flushed to the disk.
+
+**Returns**:
+
+`True` if transaction is prepared, `False` otherwise.
+
#### RocksDBPartitionTransaction.failed
@@ -4594,7 +4714,7 @@ The completed transaction should not be re-used.
def failed() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L289)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L320)
Check if the transaction has failed.
@@ -4605,414 +4725,482 @@ and
`True` if transaction is failed, `False` otherwise.
-
+
-#### RocksDBPartitionTransaction.maybe\_flush
+#### RocksDBPartitionTransaction.changelog\_topic\_partition
```python
-@_validate_transaction_state
-def maybe_flush(offset: Optional[int] = None)
+@property
+def changelog_topic_partition() -> Optional[Tuple[str, int]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L318)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L332)
-Flush the recent updates to the database and empty the update cache.
+Return the changelog topic-partition for the StorePartition of this transaction.
-It writes the WriteBatch to RocksDB and marks itself as finished.
+Returns `None` if changelog_producer is not provided.
-If writing fails, the transaction will be also marked as "failed" and
-cannot be used anymore.
+**Returns**:
->***NOTE:*** If no keys have been modified during the transaction
- (i.e. no "set" or "delete" have been called at least once), it will
- not flush ANY data to the database including the offset in order to optimize
- I/O.
+(topic, partition) or None
+
+
+
+#### RocksDBPartitionTransaction.as\_state
+
+```python
+def as_state(prefix: Any = DEFAULT_PREFIX) -> TransactionState
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L346)
+
+Create a one-time use `TransactionState` object with a limited CRUD interface
+
+to be provided to `StreamingDataFrame` operations.
+
+The `TransactionState` will prefix all the keys with the supplied `prefix`
+for all underlying operations.
**Arguments**:
-- `offset`: offset of the last processed message, optional.
+- `prefix`: a prefix to be used for all keys
-
+**Returns**:
-## quixstreams.state.rocksdb.partition
+an instance of `TransactionState`
-
+
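+A minimal sketch of the intended usage (the transaction object and prefix are
+assumptions):
+
+```python
+state = tx.as_state(prefix=b"message-key")
+state.set("count", state.get("count", default=0) + 1)
+```
+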
-### RocksDBStorePartition
+## quixstreams.state.rocksdb
+
+
+
+## quixstreams.state.rocksdb.types
+
+
+
+## quixstreams.state.rocksdb.exceptions
+
+
+
+## quixstreams.state.rocksdb.serialization
+
+
+
+## quixstreams.state.recovery
+
+
+
+### RecoveryPartition
```python
-class RocksDBStorePartition(StorePartition)
+class RecoveryPartition()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L40)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L24)
-A base class to access state in RocksDB.
+A changelog topic partition mapped to a respective `StorePartition` with helper
+methods to determine its current recovery status.
-It represents a single RocksDB database.
+Since `StorePartition`s do recovery directly, it also handles recovery transactions.
-Responsibilities:
- 1. Managing access to the RocksDB instance
- 2. Creating transactions to interact with data
- 3. Flushing WriteBatches to the RocksDB
- 4. Producing state-related changelog messages
+
-It opens the RocksDB on `__init__`. If the db is locked by another process,
-it will retry according to `open_max_retries` and `open_retry_backoff` options.
+#### RecoveryPartition.offset
+
+```python
+@property
+def offset() -> int
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L63)
+
+Get the changelog offset from the underlying `StorePartition`.
+
+**Returns**:
+
+changelog offset (int)
+
+
+
+#### RecoveryPartition.needs\_recovery
+
+```python
+@property
+def needs_recovery()
+```
-**Arguments**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L72)
-- `path`: an absolute path to the RocksDB folder
-- `options`: RocksDB options. If `None`, the default options will be used.
+Determine whether recovery is necessary for the underlying `StorePartition`.
-
+
-#### RocksDBStorePartition.begin
+#### RecoveryPartition.needs\_offset\_update
```python
-def begin() -> RocksDBPartitionTransaction
+@property
+def needs_offset_update()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L80)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L81)
-Create a new `RocksDBTransaction` object.
+Determine if an offset update is required.
-Using `RocksDBTransaction` is a recommended way for accessing the data.
+Usually checked during partition assignment if recovery was not required.
-**Returns**:
+
-an instance of `RocksDBTransaction`
+#### RecoveryPartition.update\_offset
-
+```python
+def update_offset()
+```
-#### RocksDBStorePartition.recover\_from\_changelog\_message
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L89)
+
+Update only the changelog offset of a StorePartition.
+
+
+
+#### RecoveryPartition.recover\_from\_changelog\_message
```python
def recover_from_changelog_message(
changelog_message: ConfluentKafkaMessageProto)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L106)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L109)
-Updates state from a given changelog message.
+Recover the StorePartition using a message read from its respective changelog.
**Arguments**:
-- `changelog_message`: A raw Confluent message read from a changelog topic.
+- `changelog_message`: a Confluent Kafka message (everything as bytes)
-
+
-#### RocksDBStorePartition.set\_changelog\_offset
+#### RecoveryPartition.set\_watermarks
```python
-def set_changelog_offset(changelog_offset: int)
+def set_watermarks(lowwater: int, highwater: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L130)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L121)
-Set the changelog offset based on a message (usually an "offset-only" message).
-
-Used during recovery.
+Set the changelog watermarks as gathered from Consumer.get_watermark_offsets()
**Arguments**:
-- `changelog_offset`: A changelog offset
+- `lowwater`: topic partition lowwater
+- `highwater`: topic partition highwater
-
+
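+A hedged sketch of how the watermarks are typically used (the consumer, the
+changelog `TopicPartition` and `recovery_partition` objects are assumptions):
+
+```python
+lowwater, highwater = consumer.get_watermark_offsets(changelog_tp, timeout=10)
+recovery_partition.set_watermarks(lowwater, highwater)
+
+if recovery_partition.needs_recovery:
+    ...  # consume the changelog partition until the store is up-to-date
+elif recovery_partition.needs_offset_update:
+    recovery_partition.update_offset()
+```
+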
-#### RocksDBStorePartition.produce\_to\_changelog
+### ChangelogProducerFactory
```python
-def produce_to_changelog(key: bytes,
- value: Optional[bytes] = None,
- headers: Optional[MessageHeadersMapping] = None)
+class ChangelogProducerFactory()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L140)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L132)
-Produce a message to the StorePartitions respective changelog.
+Generates ChangelogProducers, which produce changelog messages to a StorePartition.
-
+
-#### RocksDBStorePartition.write
+#### ChangelogProducerFactory.\_\_init\_\_
```python
-def write(batch: WriteBatch)
+def __init__(changelog_name: str, producer: RowProducer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L151)
-
-Write `WriteBatch` to RocksDB
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L137)
**Arguments**:
-- `batch`: an instance of `rocksdict.WriteBatch`
+- `changelog_name`: changelog topic name
+- `producer`: a RowProducer (not shared with `Application` instance)
-
+**Returns**:
-#### RocksDBStorePartition.get
+a ChangelogWriter instance
+
+
+
+#### ChangelogProducerFactory.get\_partition\_producer
```python
-def get(key: bytes,
- default: Any = None,
- cf_name: str = "default") -> Union[None, bytes, Any]
+def get_partition_producer(partition_num) -> "ChangelogProducer"
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L158)
-
-Get a key from RocksDB.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L147)
-**Arguments**:
+Generate a ChangelogProducer for producing to a specific partition number
-- `key`: a key encoded to `bytes`
-- `default`: a default value to return if the key is not found.
-- `cf_name`: rocksdb column family name. Default - "default"
+(and thus StorePartition).
-**Returns**:
+**Arguments**:
-a value if the key is present in the DB. Otherwise, `default`
+- `partition_num`: source topic partition number
-
+
-#### RocksDBStorePartition.exists
+### ChangelogProducer
```python
-def exists(key: bytes, cf_name: str = "default") -> bool
+class ChangelogProducer()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L172)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L161)
-Check if a key is present in the DB.
+Generated for a `StorePartition` to produce state changes to its respective
+kafka changelog partition.
-**Arguments**:
+
-- `key`: a key encoded to `bytes`.
-- `cf_name`: rocksdb column family name. Default - "default"
+#### ChangelogProducer.\_\_init\_\_
-**Returns**:
+```python
+def __init__(changelog_name: str, partition: int, producer: RowProducer)
+```
-`True` if the key is present, `False` otherwise.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L167)
-
+**Arguments**:
-#### RocksDBStorePartition.get\_processed\_offset
+- `changelog_name`: A changelog topic name
+- `partition`: source topic partition number
+- `producer`: a RowProducer (not shared with `Application` instance)
+
+
+
+#### ChangelogProducer.produce
```python
-def get_processed_offset() -> Optional[int]
+def produce(key: bytes,
+ value: Optional[bytes] = None,
+ headers: Optional[MessageHeadersMapping] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L183)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L190)
-Get last processed offset for the given partition
+Produce a message to a changelog topic partition.
-**Returns**:
+**Arguments**:
-offset or `None` if there's no processed offset yet
+- `key`: message key (same as state key, including prefixes)
+- `value`: message value (same as state value)
+- `headers`: message headers (includes column family info)
-
+
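+An illustrative sketch (the `row_producer` instance and the topic/partition
+values are placeholders):
+
+```python
+from quixstreams.state.recovery import ChangelogProducerFactory
+
+factory = ChangelogProducerFactory(
+    changelog_name="changelog__my-store", producer=row_producer
+)
+changelog_producer = factory.get_partition_producer(partition_num=0)
+changelog_producer.produce(key=b"prefixed-state-key", value=b"serialized-value")
+```
+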
-#### RocksDBStorePartition.get\_changelog\_offset
+### RecoveryManager
```python
-def get_changelog_offset() -> Optional[int]
+class RecoveryManager()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L195)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L215)
-Get offset that the changelog is up-to-date with.
+Manages all consumer-related aspects of recovery, including:
+ - assigning/revoking, pausing/resuming topic partitions (especially changelogs)
+ - consuming changelog messages until state is updated fully.
-**Returns**:
+Also tracks/manages `RecoveryPartitions`, which are assigned/tracked only if
+recovery for that changelog partition is required.
-offset or `None` if there's no processed offset yet
+Recovery is attempted from the `Application` after any new partition assignment.
-
+
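+A hedged sketch of the flow described above, as driven by the `Application`
+(the `recovery_manager`, `store_partition` and offsets are assumptions):
+
+```python
+# Register the changelog topic for a store before partitions are assigned
+recovery_manager.register_changelog(
+    topic_name="input-topic", store_name="default", consumer_group="my-group"
+)
+
+# On rebalance, hand over store partitions; only those needing recovery are tracked
+recovery_manager.assign_partition(
+    topic="input-topic",
+    partition=0,
+    committed_offset=100,
+    store_partitions={"default": store_partition},
+)
+
+if recovery_manager.has_assignments:
+    recovery_manager.do_recovery()
+```
+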
-#### RocksDBStorePartition.close
+#### RecoveryManager.partitions
```python
-def close()
+@property
+def partitions() -> Dict[int, Dict[str, RecoveryPartition]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L205)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L234)
-Close the underlying RocksDB
+Returns a mapping of assigned RecoveryPartitions in the following format:
+{partition: {store_name: RecoveryPartition}}
-
+
-#### RocksDBStorePartition.path
+#### RecoveryManager.has\_assignments
```python
@property
-def path() -> str
+def has_assignments() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L220)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L242)
-Absolute path to RocksDB database folder
+Whether the Application has assigned RecoveryPartitions
**Returns**:
-file path
+has assignments, as bool
-
+
-#### RocksDBStorePartition.destroy
+#### RecoveryManager.recovering
```python
-@classmethod
-def destroy(cls, path: str)
+@property
+def recovering() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L228)
-
-Delete underlying RocksDB database
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L251)
-The database must be closed first.
+Whether the Application is currently recovering
-**Arguments**:
+**Returns**:
-- `path`: an absolute path to the RocksDB folder
+is recovering, as bool
-
+
-#### RocksDBStorePartition.get\_column\_family\_handle
+#### RecoveryManager.register\_changelog
```python
-def get_column_family_handle(cf_name: str) -> ColumnFamily
+def register_changelog(topic_name: str, store_name: str,
+ consumer_group: str) -> Topic
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L238)
-
-Get a column family handle to pass to it WriteBatch.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L259)
-This method will cache the CF handle instance to avoid creating them
-repeatedly.
+Register a changelog Topic with the TopicManager.
**Arguments**:
-- `cf_name`: column family name
-
-**Returns**:
-
-instance of `rocksdict.ColumnFamily`
+- `topic_name`: source topic name
+- `store_name`: name of the store
+- `consumer_group`: name of the consumer group
-
+
-#### RocksDBStorePartition.get\_column\_family
+#### RecoveryManager.do\_recovery
```python
-def get_column_family(cf_name: str) -> Rdict
+def do_recovery()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L259)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L275)
-Get a column family instance.
+If there are any active RecoveryPartitions, do a recovery procedure.
-This method will cache the CF instance to avoid creating them repeatedly.
+Afterwards, normal `Application` processing will resume.
-**Arguments**:
+
-- `cf_name`: column family name
+#### RecoveryManager.assign\_partition
-**Returns**:
+```python
+def assign_partition(topic: str, partition: int, committed_offset: int,
+ store_partitions: Dict[str, StorePartition])
+```
-instance of `rocksdict.Rdict` for the given column family
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L328)
-
+Assigns `StorePartition`s (as `RecoveryPartition`s) ONLY IF recovery is required.
-## quixstreams.state.rocksdb.store
+Pauses active consumer partitions as needed.
-
+
-### RocksDBStore
+#### RecoveryManager.revoke\_partition
```python
-class RocksDBStore(Store)
+def revoke_partition(partition_num: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L19)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L395)
-RocksDB-based state store.
+Revoke ALL StorePartitions (across all Stores) for a given partition number.
-It keeps track of individual store partitions and provides access to the
-partitions' transactions.
+**Arguments**:
-
+- `partition_num`: partition number of source topic
-#### RocksDBStore.\_\_init\_\_
+
+
+## quixstreams.state
+
+
+
+## quixstreams.state.types
+
+
+
+### Store
```python
-def __init__(
- name: str,
- topic: str,
- base_dir: str,
- changelog_producer_factory: Optional[ChangelogProducerFactory] = None,
- options: Optional[options_type] = None)
+class Store(Protocol)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L29)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L11)
-**Arguments**:
+Abstract state store.
-- `name`: a unique store name
-- `topic`: a topic name for this store
-- `base_dir`: path to a directory with the state
-- `changelog_producer_factory`: a ChangelogProducerFactory instance
-if using changelogs
-- `options`: RocksDB options. If `None`, the default options will be used.
+It keeps track of individual store partitions and provides access to the
+partitions' transactions.
-
+
-#### RocksDBStore.topic
+#### Store.topic
```python
@property
def topic() -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L53)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L22)
-Store topic name
+Topic name
-
+
-#### RocksDBStore.name
+#### Store.name
```python
@property
def name() -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L60)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L29)
Store name
-
+
-#### RocksDBStore.partitions
+#### Store.partitions
```python
@property
-def partitions() -> Dict[int, RocksDBStorePartition]
+def partitions() -> Dict[int, "StorePartition"]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L67)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L36)
Mapping of assigned store partitions
-
+**Returns**:
-#### RocksDBStore.assign\_partition
+dict of "{partition: }"
+
+
+
+#### Store.assign\_partition
```python
-def assign_partition(partition: int) -> RocksDBStorePartition
+def assign_partition(partition: int) -> "StorePartition"
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L80)
-
-Open and assign store partition.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L43)
-If the partition is already assigned, it will not re-open it and return
-the existing partition instead.
+Assign new store partition
**Arguments**:
@@ -5020,40 +5208,37 @@ the existing partition instead.
**Returns**:
-instance of`RocksDBStorePartition`
+instance of `StorePartition`
-
+
-#### RocksDBStore.revoke\_partition
+#### Store.revoke\_partition
```python
def revoke_partition(partition: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L117)
-
-Revoke and close the assigned store partition.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L52)
-If the partition is not assigned, it will log the message and return.
+Revoke assigned store partition
**Arguments**:
- `partition`: partition number
-
+
-#### RocksDBStore.start\_partition\_transaction
+#### Store.start\_partition\_transaction
```python
-def start_partition_transaction(partition: int) -> RocksDBPartitionTransaction
+def start_partition_transaction(partition: int) -> "PartitionTransaction"
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L138)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L60)
Start a new partition transaction.
-`RocksDBPartitionTransaction` is the primary interface for working with data in
-the underlying RocksDB.
+`PartitionTransaction` is the primary interface for working with data in Stores.
**Arguments**:
@@ -5061,109 +5246,148 @@ the underlying RocksDB.
**Returns**:
-instance of `RocksDBPartitionTransaction`
+instance of `PartitionTransaction`
-
+
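+A brief sketch against the `Store` protocol (the concrete `store` object and the
+partition number are assumptions):
+
+```python
+store.assign_partition(0)
+tx = store.start_partition_transaction(partition=0)
+state = tx.as_state(prefix=b"message-key")
+state.set("total", 1)
+tx.flush()
+```
+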
-#### RocksDBStore.close
+#### Store.close
```python
def close()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L160)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L69)
-Close the store and revoke all assigned partitions
+Close store and revoke all store partitions
-
+
-## quixstreams.state.rocksdb.exceptions
+### StorePartition
-
+```python
+class StorePartition(Protocol)
+```
-## quixstreams.state.rocksdb.types
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L80)
-
+A base class to access state in the underlying storage.
+It represents a single instance of some storage (e.g. a single database for
+the persistent storage).
-## quixstreams.state.rocksdb.options
+
-
+#### StorePartition.path
-### RocksDBOptions
+```python
+@property
+def path() -> str
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L89)
+
+Absolute path to RocksDB database folder
+
+
+
+#### StorePartition.begin
```python
-@dataclasses.dataclass(frozen=True)
-class RocksDBOptions(RocksDBOptionsType)
+def begin() -> "PartitionTransaction"
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/options.py#L25)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L95)
-RocksDB database options.
+Start a new `PartitionTransaction`
+
+
+
+#### StorePartition.recover\_from\_changelog\_message
+
+```python
+def recover_from_changelog_message(
+ changelog_message: ConfluentKafkaMessageProto, committed_offset: int)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L100)
+
+Updates state from a given changelog message.
**Arguments**:
-- `dumps`: function to dump data to JSON
-- `loads`: function to load data from JSON
-- `open_max_retries`: number of times to retry opening the database
-if it's locked by another process. To disable retrying, pass 0
-- `open_retry_backoff`: number of seconds to wait between each retry.
-Please see `rocksdict.Options` for a complete description of other options.
+- `changelog_message`: A raw Confluent message read from a changelog topic.
+- `committed_offset`: latest committed offset for the partition
-
+
-#### RocksDBOptions.to\_options
+#### StorePartition.get\_processed\_offset
```python
-def to_options() -> rocksdict.Options
+def get_processed_offset() -> Optional[int]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/options.py#L53)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L111)
-Convert parameters to `rocksdict.Options`
+Get last processed offset for the given partition
**Returns**:
-instance of `rocksdict.Options`
-
-
-
-## quixstreams.state.state
+offset or `None` if there's no processed offset yet
-
+
-### TransactionState
+#### StorePartition.get\_changelog\_offset
```python
-class TransactionState(State)
+def get_changelog_offset() -> Optional[int]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L6)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L118)
-
+Get offset that the changelog is up-to-date with.
-#### TransactionState.\_\_init\_\_
+**Returns**:
+
+offset or `None` if there's no processed offset yet
+
+
+
+#### StorePartition.set\_changelog\_offset
```python
-def __init__(transaction: PartitionTransaction)
+def set_changelog_offset(changelog_offset: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L9)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L125)
-Simple key-value state to be provided into `StreamingDataFrame` functions
+Set the changelog offset based on a message (usually an "offset-only" message).
+
+Used during recovery.
**Arguments**:
-- `transaction`: instance of `PartitionTransaction`
+- `changelog_offset`: A changelog offset
-
+
-#### TransactionState.get
+### State
+
+```python
+class State(Protocol)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L136)
+
+Primary interface for working with key-value state data from `StreamingDataFrame`
+
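+A hedged sketch of how `State` typically appears in user code (the dataframe,
+topic and field names are placeholders):
+
+```python
+from quixstreams.state.types import State
+
+def count_messages(value: dict, state: State):
+    total = state.get("total", default=0) + 1
+    state.set("total", total)
+    value["total"] = total
+    return value
+
+# Pass the callback to a StreamingDataFrame with stateful=True
+sdf = sdf.apply(count_messages, stateful=True)
+```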
+
+
+#### State.get
```python
def get(key: Any, default: Any = None) -> Optional[Any]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L17)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L141)
Get the value for key if key is present in the state, else default
@@ -5176,15 +5400,15 @@ Get the value for key if key is present in the state, else default
value or None if the key is not found and `default` is not provided
-
+
-#### TransactionState.set
+#### State.set
```python
def set(key: Any, value: Any)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L27)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L151)
Set value for the key.
@@ -5193,15 +5417,15 @@ Set value for the key.
- `key`: key
- `value`: value
-
+
-#### TransactionState.delete
+#### State.delete
```python
def delete(key: Any)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L35)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L159)
Delete value for the key.
@@ -5211,15 +5435,15 @@ This function always returns `None`, even if value is not found.
- `key`: key
-
+
-#### TransactionState.exists
+#### State.exists
```python
def exists(key: Any) -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L44)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L168)
Check if the key exists in state.
@@ -5231,908 +5455,904 @@ Check if the key exists in state.
True if key exists, False otherwise
-
-
-## quixstreams.state
-
-
-
-## quixstreams.state.manager
-
-
-
-### StateStoreManager
-
-```python
-class StateStoreManager()
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L31)
-
-Class for managing state stores and partitions.
-
-StateStoreManager is responsible for:
- - reacting to rebalance callbacks
- - managing the individual state stores
- - providing access to store transactions
-
-
+
-#### StateStoreManager.stores
+### PartitionTransaction
```python
-@property
-def stores() -> Dict[str, Dict[str, Store]]
+class PartitionTransaction(Protocol)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L71)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L177)
-Map of registered state stores
-
-**Returns**:
-
-dict in format {topic: {store_name: store}}
+A transaction class to perform simple key-value operations like
+"get", "set", "delete" and "exists" on a single storage partition.
-
+
-#### StateStoreManager.recovery\_required
+#### PartitionTransaction.as\_state
```python
-@property
-def recovery_required() -> bool
+def as_state(prefix: Any) -> State
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L79)
-
-Whether recovery needs to be done.
-
-
-
-#### StateStoreManager.using\_changelogs
-
-```python
-@property
-def using_changelogs() -> bool
-```
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L183)
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L88)
+Create an instance implementing the `State` protocol to be provided
-Whether the StateStoreManager is using changelog topics
+to `StreamingDataFrame` functions.
+All operations called on this State object will be prefixed with
+the supplied `prefix`.
**Returns**:
-using changelogs, as bool
+an instance implementing the `State` protocol
-
+
-#### StateStoreManager.do\_recovery
+#### PartitionTransaction.get
```python
-def do_recovery()
+def get(key: Any, prefix: bytes, default: Any = None) -> Optional[Any]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L96)
-
-Perform a state recovery, if necessary.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L194)
-
+Get the value for key if key is present in the state, else default
-#### StateStoreManager.stop\_recovery
+**Arguments**:
-```python
-def stop_recovery()
-```
+- `key`: key
+- `prefix`: a key prefix
+- `default`: default value to return if the key is not found
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L102)
+**Returns**:
-Stop recovery (called during app shutdown).
+value or None if the key is not found and `default` is not provided
-
+
-#### StateStoreManager.get\_store
+#### PartitionTransaction.set
```python
-def get_store(topic: str,
- store_name: str = _DEFAULT_STATE_STORE_NAME) -> Store
+def set(key: Any, prefix: bytes, value: Any)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L108)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L205)
-Get a store for given name and topic
+Set value for the key.
**Arguments**:
-- `topic`: topic name
-- `store_name`: store name
-
-**Returns**:
-
-instance of `Store`
+- `key`: key
+- `prefix`: a key prefix
+- `value`: value
-
+
-#### StateStoreManager.register\_store
+#### PartitionTransaction.delete
```python
-def register_store(topic_name: str,
- store_name: str = _DEFAULT_STATE_STORE_NAME)
+def delete(key: Any, prefix: bytes)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L141)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L214)
-Register a state store to be managed by StateStoreManager.
-
-During processing, the StateStoreManager will react to rebalancing callbacks
-and assign/revoke the partitions for registered stores.
+Delete value for the key.
-Each store can be registered only once for each topic.
+This function always returns `None`, even if value is not found.
**Arguments**:
-- `topic_name`: topic name
-- `store_name`: store name
+- `key`: key
+- `prefix`: a key prefix
-
+
-#### StateStoreManager.register\_windowed\_store
+#### PartitionTransaction.exists
```python
-def register_windowed_store(topic_name: str, store_name: str)
+def exists(key: Any, prefix: bytes) -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L166)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L224)
-Register a windowed state store to be managed by StateStoreManager.
+Check if the key exists in state.
-During processing, the StateStoreManager will react to rebalancing callbacks
-and assign/revoke the partitions for registered stores.
+**Arguments**:
-Each window store can be registered only once for each topic.
+- `key`: key
+- `prefix`: a key prefix
-**Arguments**:
+**Returns**:
-- `topic_name`: topic name
-- `store_name`: store name
+True if key exists, False otherwise
-
+
-#### StateStoreManager.clear\_stores
+#### PartitionTransaction.failed
```python
-def clear_stores()
+@property
+def failed() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L189)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L234)
-Delete all state stores managed by StateStoreManager.
+Return `True` if transaction failed to update data at some point.
-
+Failed transactions cannot be re-used.
-#### StateStoreManager.on\_partition\_assign
+**Returns**:
-```python
-def on_partition_assign(tp: TopicPartition) -> List[StorePartition]
-```
+bool
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L204)
+
-Assign store partitions for each registered store for the given `TopicPartition`
+#### PartitionTransaction.completed
-and return a list of assigned `StorePartition` objects.
+```python
+@property
+def completed() -> bool
+```
-**Arguments**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L244)
-- `tp`: `TopicPartition` from Kafka consumer
+Return `True` if transaction is successfully completed.
+
+Completed transactions cannot be re-used.
**Returns**:
-list of assigned `StorePartition`
+bool
-
+
-#### StateStoreManager.on\_partition\_revoke
+#### PartitionTransaction.prepared
```python
-def on_partition_revoke(tp: TopicPartition)
+@property
+def prepared() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L223)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L254)
-Revoke store partitions for each registered store for the given `TopicPartition`
+Return `True` if the transaction is prepared.
-**Arguments**:
+Prepared transactions cannot receive new updates, but can be flushed.
+
+**Returns**:
-- `tp`: `TopicPartition` from Kafka consumer
+bool
-
+
-#### StateStoreManager.on\_partition\_lost
+#### PartitionTransaction.prepare
```python
-def on_partition_lost(tp: TopicPartition)
+def prepare(processed_offset: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L235)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L263)
+
+Produce changelog messages to the changelog topic for all changes accumulated
+
+in this transaction and prepare the transaction to flush its state to the state
+store.
-Revoke and close store partitions for each registered store for the given
+After successful `prepare()`, the transaction status is changed to PREPARED,
+and it cannot receive updates anymore.
-`TopicPartition`
+If changelog is disabled for this application, no updates will be produced
+to the changelog topic.
**Arguments**:
-- `tp`: `TopicPartition` from Kafka consumer
+- `processed_offset`: the offset of the latest processed message
-
+
-#### StateStoreManager.init
+#### PartitionTransaction.changelog\_topic\_partition
```python
-def init()
+@property
+def changelog_topic_partition() -> Optional[Tuple[str, int]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L244)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L279)
-Initialize `StateStoreManager` and create a store directory
+Return the changelog topic-partition for the StorePartition of this transaction.
+Returns `None` if changelog_producer is not provided.
-
+**Returns**:
-#### StateStoreManager.close
+(topic, partition) or None
+
+
+
+#### PartitionTransaction.flush
```python
-def close()
+def flush(processed_offset: Optional[int] = None,
+ changelog_offset: Optional[int] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L251)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L288)
-Close all registered stores
+Flush the recent updates to the storage.
-
+**Arguments**:
-#### StateStoreManager.get\_store\_transaction
+- `processed_offset`: offset of the last processed message, optional.
+- `changelog_offset`: offset of the last produced changelog message,
+optional.
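+
+Putting the pieces together, a rough sketch of the lifecycle looks like this.
+The framework normally drives it; `tx`, the prefix and the offsets below are
+illustrative, and `set()` is the counterpart of the methods documented above:
+
+```python
+# Minimal sketch of a PartitionTransaction lifecycle (illustrative values)
+prefix = b"message-key"                        # keys are scoped by a prefix
+
+tx.set(key="count", value=42, prefix=prefix)
+if tx.exists(key="count", prefix=prefix):
+    tx.delete(key="stale", prefix=prefix)      # returns None even if absent
+
+if not tx.failed:
+    tx.prepare(processed_offset=100)   # produce changelog messages, status -> PREPARED
+    tx.flush(processed_offset=100)     # persist the accumulated updates
+# A successfully flushed transaction is completed and cannot be reused
+```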
-```python
-def get_store_transaction(
- store_name: str = _DEFAULT_STATE_STORE_NAME) -> PartitionTransaction
-```
+
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L259)
+### WindowedState
-Get active `PartitionTransaction` for the store
+```python
+class WindowedState(Protocol)
+```
-**Arguments**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L306)
-- `store_name`:
+A windowed state to be provided into `StreamingDataFrame` window functions.
-
+
-#### StateStoreManager.start\_store\_transaction
+#### WindowedState.get\_window
```python
-@contextlib.contextmanager
-def start_store_transaction(topic: str, partition: int,
- offset: int) -> Iterator["_MultiStoreTransaction"]
+def get_window(start_ms: int,
+ end_ms: int,
+ default: Any = None) -> Optional[Any]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L274)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L311)
-Starting the multi-store transaction for the Kafka message.
-
-This transaction will keep track of all used stores and flush them in the end.
-If any exception is caught during this transaction, none of them
-will be flushed as a best effort to keep stores consistent in "at-least-once" setting.
+Get the value of the window defined by `start` and `end` timestamps
-There can be only one active transaction at a time. Starting a new transaction
-before the end of the current one will fail.
+if the window is present in the state, else default
**Arguments**:
-- `topic`: message topic
-- `partition`: message partition
-- `offset`: message offset
+- `start_ms`: start of the window in milliseconds
+- `end_ms`: end of the window in milliseconds
+- `default`: default value to return if the key is not found
-
+**Returns**:
-## quixstreams.state.recovery
+value or None if the key is not found and `default` is not provided
-
+
-### RecoveryPartition
+#### WindowedState.update\_window
```python
-class RecoveryPartition()
+def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L20)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L325)
-A changelog topic partition mapped to a respective `StorePartition` with helper
-methods to determine its current recovery status.
+Set a value for the window.
-Since `StorePartition`s do recovery directly, it also handles recovery transactions.
+This method will also update the latest observed timestamp in the state partition
+using the provided `timestamp_ms`.
-
+**Arguments**:
-#### RecoveryPartition.offset
+- `start_ms`: start of the window in milliseconds
+- `end_ms`: end of the window in milliseconds
+- `value`: value of the window
+- `timestamp_ms`: current message timestamp in milliseconds
+
+
+
+#### WindowedState.get\_latest\_timestamp
```python
-@property
-def offset() -> int
+def get_latest_timestamp() -> int
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L41)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L339)
-Get the changelog offset from the underlying `StorePartition`.
+Get the latest observed timestamp for the current state partition.
+
+Use this timestamp to determine if the arriving event is late and should be
+discarded from the processing.
**Returns**:
-changelog offset (int)
+latest observed event timestamp in milliseconds
-
+
-#### RecoveryPartition.needs\_recovery
+#### WindowedState.expire\_windows
```python
-@property
-def needs_recovery()
+def expire_windows(duration_ms: int, grace_ms: int = 0)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L50)
-
-Determine whether recovery is necessary for underlying `StorePartition`.
-
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L350)
-#### RecoveryPartition.needs\_offset\_update
+Get a list of expired windows from RocksDB considering the current
-```python
-@property
-def needs_offset_update()
-```
+latest timestamp, window duration and grace period.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L59)
+It also marks the latest found window as expired in the expiration index, so
+calling this method multiple times will yield different results for the same
+"latest timestamp".
-Determine if an offset update is required.
+**Arguments**:
-Usually checked during assign if recovery was not required.
+- `duration_ms`: duration of the windows in milliseconds
+- `grace_ms`: grace period in milliseconds. Default - "0"
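+
+As a rough sketch of how these methods fit together, the snippet below keeps a
+tumbling sum per window; `state` is assumed to be the `WindowedState` passed to
+a window function, and the boundary math is illustrative rather than the
+library's internal implementation:
+
+```python
+DURATION_MS = 10_000   # 10-second tumbling windows (illustrative)
+GRACE_MS = 1_000
+
+def on_value(value: int, timestamp_ms: int, state) -> None:
+    # Drop events that are older than the allowed lateness
+    if timestamp_ms < state.get_latest_timestamp() - GRACE_MS:
+        return
+
+    start_ms = timestamp_ms - (timestamp_ms % DURATION_MS)
+    end_ms = start_ms + DURATION_MS
+
+    current = state.get_window(start_ms, end_ms, default=0)
+    state.update_window(start_ms, end_ms, value=current + value,
+                        timestamp_ms=timestamp_ms)
+
+    # Windows past duration + grace can be emitted and marked as expired
+    expired = state.expire_windows(duration_ms=DURATION_MS, grace_ms=GRACE_MS)
+```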
-
+
-#### RecoveryPartition.update\_offset
+### WindowedPartitionTransaction
```python
-def update_offset()
+class WindowedPartitionTransaction(Protocol)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L67)
-
-Update only the changelog offset of a StorePartition.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L365)
-
+
-#### RecoveryPartition.recover\_from\_changelog\_message
+#### WindowedPartitionTransaction.failed
```python
-def recover_from_changelog_message(
- changelog_message: ConfluentKafkaMessageProto)
+@property
+def failed() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L87)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L368)
-Recover the StorePartition using a message read from its respective changelog.
+Return `True` if transaction failed to update data at some point.
-**Arguments**:
+Failed transactions cannot be re-used.
-- `changelog_message`: A confluent kafka message (everything as bytes)
+**Returns**:
-
+bool
-#### RecoveryPartition.set\_watermarks
+
+
+#### WindowedPartitionTransaction.completed
```python
-def set_watermarks(lowwater: int, highwater: int)
+@property
+def completed() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L99)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L378)
-Set the changelog watermarks as gathered from Consumer.get_watermark_offsets()
+Return `True` if transaction is successfully completed.
-**Arguments**:
+Completed transactions cannot be re-used.
-- `lowwater`: topic partition lowwater
-- `highwater`: topic partition highwater
+**Returns**:
-
+bool
-### ChangelogProducerFactory
+
+
+#### WindowedPartitionTransaction.prepared
```python
-class ChangelogProducerFactory()
+@property
+def prepared() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L110)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L388)
-Generates ChangelogProducers, which produce changelog messages to a StorePartition.
+Return `True` if the transaction is prepared.
-
+Prepared transactions cannot receive new updates, but can be flushed.
-#### ChangelogProducerFactory.\_\_init\_\_
+**Returns**:
+
+bool
+
+
+
+#### WindowedPartitionTransaction.prepare
```python
-def __init__(changelog_name: str, producer: RowProducer)
+def prepare(processed_offset: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L115)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L397)
-**Arguments**:
+Produce changelog messages to the changelog topic for all changes accumulated
-- `changelog_name`: changelog topic name
-- `producer`: a RowProducer (not shared with `Application` instance)
+in this transaction and prepare the transaction to flush its state to the state
+store.
-**Returns**:
+After successful `prepare()`, the transaction status is changed to PREPARED,
+and it cannot receive updates anymore.
-a ChangelogWriter instance
+If changelog is disabled for this application, no updates will be produced
+to the changelog topic.
-
+**Arguments**:
-#### ChangelogProducerFactory.get\_partition\_producer
+- `processed_offset`: the offset of the latest processed message
+
+
+
+#### WindowedPartitionTransaction.get\_window
```python
-def get_partition_producer(partition_num)
+def get_window(start_ms: int,
+ end_ms: int,
+ prefix: bytes,
+ default: Any = None) -> Optional[Any]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L125)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L414)
-Generate a ChangelogProducer for producing to a specific partition number
+Get the value of the window defined by `start` and `end` timestamps
-(and thus StorePartition).
+if the window is present in the state, else default
**Arguments**:
-- `partition_num`: source topic partition number
+- `start_ms`: start of the window in milliseconds
+- `end_ms`: end of the window in milliseconds
+- `prefix`: a key prefix
+- `default`: default value to return if the key is not found
-
+**Returns**:
-### ChangelogProducer
+value or None if the key is not found and `default` is not provided
+
+
+
+#### WindowedPartitionTransaction.update\_window
```python
-class ChangelogProducer()
+def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int,
+ prefix: bytes)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L137)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L433)
-Generated for a `StorePartition` to produce state changes to its respective
-kafka changelog partition.
+Set a value for the window.
-
+This method will also update the latest observed timestamp in the state partition
+using the provided `timestamp_ms`.
-#### ChangelogProducer.\_\_init\_\_
+**Arguments**:
+
+- `start_ms`: start of the window in milliseconds
+- `end_ms`: end of the window in milliseconds
+- `value`: value of the window
+- `timestamp_ms`: current message timestamp in milliseconds
+- `prefix`: a key prefix
+
+
+
+#### WindowedPartitionTransaction.get\_latest\_timestamp
```python
-def __init__(changelog_name: str, partition_num: int, producer: RowProducer)
+def get_latest_timestamp() -> int
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L143)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L450)
-**Arguments**:
+Get the latest observed timestamp for the current state partition.
-- `changelog_name`: A changelog topic name
-- `partition_num`: source topic partition number
-- `producer`: a RowProducer (not shared with `Application` instance)
+Use this timestamp to determine if the arriving event is late and should be
+discarded from the processing.
-
+**Returns**:
-#### ChangelogProducer.produce
+latest observed event timestamp in milliseconds
+
+
+
+#### WindowedPartitionTransaction.expire\_windows
```python
-def produce(key: bytes,
- value: Optional[bytes] = None,
- headers: Optional[MessageHeadersMapping] = None)
+def expire_windows(duration_ms: int, prefix: bytes, grace_ms: int = 0)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L153)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L461)
-Produce a message to a changelog topic partition.
+Get a list of expired windows from RocksDB considering the current
+
+latest timestamp, window duration and grace period.
+
+It also marks the latest found window as expired in the expiration index, so
+calling this method multiple times will yield different results for the same
+"latest timestamp".
**Arguments**:
-- `key`: message key (same as state key, including prefixes)
-- `value`: message value (same as state value)
-- `headers`: message headers (includes column family info)
+- `duration_ms`: duration of the windows in milliseconds
+- `prefix`: a key prefix
+- `grace_ms`: grace period in milliseconds. Default - "0"
-
+
-### RecoveryManager
+#### WindowedPartitionTransaction.flush
```python
-class RecoveryManager()
+def flush(processed_offset: Optional[int] = None,
+ changelog_offset: Optional[int] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L178)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L476)
-Manages all consumer-related aspects of recovery, including:
- - assigning/revoking, pausing/resuming topic partitions (especially changelogs)
- - consuming changelog messages until state is updated fully.
+Flush the recent updates to the storage.
-Also tracks/manages `RecoveryPartitions`, which are assigned/tracked only if
-recovery for that changelog partition is required.
+**Arguments**:
-Recovery is attempted from the `Application` after any new partition assignment.
+- `processed_offset`: offset of the last processed message, optional.
+- `changelog_offset`: offset of the last produced changelog message,
+optional.
-
+
-#### RecoveryManager.has\_assignments
+#### WindowedPartitionTransaction.changelog\_topic\_partition
```python
@property
-def has_assignments() -> bool
+def changelog_topic_partition() -> Optional[Tuple[str, int]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L197)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L490)
-Whether the Application has assigned RecoveryPartitions
+Return the changelog topic-partition for the StorePartition of this transaction.
+
+Returns `None` if changelog_producer is not provided.
**Returns**:
-has assignments, as bool
+(topic, partition) or None
-
+
-#### RecoveryManager.recovering
+### PartitionRecoveryTransaction
```python
-@property
-def recovering() -> bool
+class PartitionRecoveryTransaction(Protocol)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L206)
-
-Whether the Application is currently recovering
-
-**Returns**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L504)
-is recovering, as bool
+A class for managing recovery for a StorePartition from a changelog message
-
+
-#### RecoveryManager.register\_changelog
+#### PartitionRecoveryTransaction.flush
```python
-def register_changelog(topic_name: str, store_name: str, consumer_group: str)
+def flush()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L214)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L511)
-Register a changelog Topic with the TopicManager.
-
-**Arguments**:
-
-- `topic_name`: source topic name
-- `store_name`: name of the store
-- `consumer_group`: name of the consumer group
+Flush the recovery update to the storage.
-
+
-#### RecoveryManager.do\_recovery
+### PartitionTransactionStatus
```python
-def do_recovery()
+class PartitionTransactionStatus(enum.Enum)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L228)
-
-If there are any active RecoveryPartitions, do a recovery procedure.
-
-After, will resume normal `Application` processing.
-
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L518)
-#### RecoveryManager.assign\_partition
+
-```python
-def assign_partition(topic_name: str, partition_num: int,
- store_partitions: Dict[str, StorePartition])
-```
+#### STARTED
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L274)
+Transaction is started and accepts updates
-Assigns `StorePartition`s (as `RecoveryPartition`s) ONLY IF recovery required.
+
-Pauses active consumer partitions as needed.
+#### PREPARED
-
+Transaction is prepared, it can no longer receive updates
-#### RecoveryManager.revoke\_partition
+
-```python
-def revoke_partition(partition_num: int)
-```
+#### COMPLETE
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L336)
+Transaction is fully completed, it cannot be used anymore
-revoke ALL StorePartitions (across all Stores) for a given partition number
+
-**Arguments**:
+#### FAILED
-- `partition_num`: partition number of source topic
+Transaction is failed, it cannot be used anymore
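+
+Illustratively, these values line up with the boolean properties documented on
+`PartitionTransaction` (the framework performs the transitions itself):
+
+```python
+if tx.failed:
+    pass              # FAILED: discard the transaction and start a new one
+elif tx.prepared and not tx.completed:
+    tx.flush()        # PREPARED: updates are frozen, but flushing is allowed
+```
+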
## quixstreams.state.exceptions
-
+
-## quixstreams.state.types
+## quixstreams.state.manager
-
+
-### Store
+### StateStoreManager
```python
-class Store(Protocol)
+class StateStoreManager()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L14)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L24)
-Abstract state store.
+Class for managing state stores and partitions.
-It keeps track of individual store partitions and provides access to the
-partitions' transactions.
+StateStoreManager is responsible for:
+ - reacting to rebalance callbacks
+ - managing the individual state stores
+ - providing access to store transactions
-
+
-#### Store.topic
+#### StateStoreManager.stores
```python
@property
-def topic() -> str
+def stores() -> Dict[str, Dict[str, Store]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L25)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L63)
-Topic name
+Map of registered state stores
-
+**Returns**:
-#### Store.name
+dict in format {topic: {store_name: store}}
+
+
+
+#### StateStoreManager.recovery\_required
```python
@property
-def name() -> str
+def recovery_required() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L32)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L71)
-Store name
+Whether recovery needs to be done.
-
+
-#### Store.partitions
+#### StateStoreManager.using\_changelogs
```python
@property
-def partitions() -> Dict[int, "StorePartition"]
+def using_changelogs() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L39)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L80)
-Mapping of assigned store partitions
+Whether the StateStoreManager is using changelog topics
**Returns**:
-dict of "{partition: }"
+using changelogs, as bool
-
+
-#### Store.assign\_partition
+#### StateStoreManager.do\_recovery
```python
-def assign_partition(partition: int) -> "StorePartition"
+def do_recovery()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L46)
-
-Assign new store partition
-
-**Arguments**:
-
-- `partition`: partition number
-
-**Returns**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L88)
-instance of `StorePartition`
+Perform a state recovery, if necessary.
-
+
-#### Store.revoke\_partition
+#### StateStoreManager.stop\_recovery
```python
-def revoke_partition(partition: int)
+def stop_recovery()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L55)
-
-Revoke assigned store partition
-
-**Arguments**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L94)
-- `partition`: partition number
+Stop recovery (called during app shutdown).
-
+
-#### Store.start\_partition\_transaction
+#### StateStoreManager.get\_store
```python
-def start_partition_transaction(
- partition: int) -> Optional["PartitionTransaction"]
+def get_store(topic: str, store_name: str = DEFAULT_STATE_STORE_NAME) -> Store
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L63)
-
-Start a new partition transaction.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L100)
-`PartitionTransaction` is the primary interface for working with data in Stores.
+Get a store for given name and topic
**Arguments**:
-- `partition`: partition number
+- `topic`: topic name
+- `store_name`: store name
**Returns**:
-instance of `PartitionTransaction`
+instance of `Store`
-
+
-#### Store.close
+#### StateStoreManager.register\_store
```python
-def close()
+def register_store(topic_name: str,
+ store_name: str = DEFAULT_STATE_STORE_NAME)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L74)
-
-Close store and revoke all store partitions
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L134)
-
+Register a state store to be managed by StateStoreManager.
-### StorePartition
+During processing, the StateStoreManager will react to rebalancing callbacks
+and assign/revoke the partitions for registered stores.
-```python
-class StorePartition(Protocol)
-```
+Each store can be registered only once for each topic.
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L85)
+**Arguments**:
-A base class to access state in the underlying storage.
-It represents a single instance of some storage (e.g. a single database for
-the persistent storage).
+- `topic_name`: topic name
+- `store_name`: store name
-
+
-#### StorePartition.path
+#### StateStoreManager.register\_windowed\_store
```python
-@property
-def path() -> str
+def register_windowed_store(topic_name: str, store_name: str)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L94)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L159)
-Absolute path to RocksDB database folder
+Register a windowed state store to be managed by StateStoreManager.
-
+During processing, the StateStoreManager will react to rebalancing callbacks
+and assign/revoke the partitions for registered stores.
-#### StorePartition.begin
+Each window store can be registered only once for each topic.
+
+**Arguments**:
+
+- `topic_name`: topic name
+- `store_name`: store name
+
+
+
+#### StateStoreManager.clear\_stores
```python
-def begin() -> "PartitionTransaction"
+def clear_stores()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L100)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L182)
-State new `PartitionTransaction`
+Delete all state stores managed by StateStoreManager.
-
+
-#### StorePartition.recover\_from\_changelog\_message
+#### StateStoreManager.on\_partition\_assign
```python
-def recover_from_changelog_message(
- changelog_message: ConfluentKafkaMessageProto)
+def on_partition_assign(topic: str, partition: int,
+ committed_offset: int) -> List[StorePartition]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L105)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L197)
-Updates state from a given changelog message.
+Assign store partitions for each registered store for the given topic and partition
+
+and return a list of assigned `StorePartition` objects.
**Arguments**:
-- `changelog_message`: A raw Confluent message read from a changelog topic.
+- `topic`: Kafka topic name
+- `partition`: Kafka topic partition
+- `committed_offset`: latest committed offset for the partition
+
+**Returns**:
+
+list of assigned `StorePartition`
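+
+A hedged sketch of the rebalance flow, assuming an existing `StateStoreManager`
+instance `manager` (normally created and driven by the `Application`; the topic
+name and offsets are placeholders):
+
+```python
+manager.register_store(topic_name="orders")
+
+# From the consumer's on_assign callback:
+store_partitions = manager.on_partition_assign(
+    topic="orders", partition=0, committed_offset=1234
+)
+
+store = manager.get_store(topic="orders")
+
+# From the on_revoke callback, when the partition moves away:
+manager.on_partition_revoke(topic="orders", partition=0)
+```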
-
+
-#### StorePartition.produce\_to\_changelog
+#### StateStoreManager.on\_partition\_revoke
```python
-def produce_to_changelog(key: bytes,
- value: Optional[bytes] = None,
- headers: Optional[MessageHeadersMapping] = None)
+def on_partition_revoke(topic: str, partition: int)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L115)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L223)
-Produce a message to the StorePartitions respective changelog.
+Revoke store partitions for each registered store for the given topic and partition
-
+**Arguments**:
+
+- `topic`: Kafka topic name
+- `partition`: Kafka topic partition
+
+
-#### StorePartition.get\_processed\_offset
+#### StateStoreManager.init
```python
-def get_processed_offset() -> Optional[int]
+def init()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L126)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L236)
-Get last processed offset for the given partition
-
-**Returns**:
+Initialize `StateStoreManager` and create a store directory
-offset or `None` if there's no processed offset yet
-
+
-#### StorePartition.get\_changelog\_offset
+#### StateStoreManager.close
```python
-def get_changelog_offset() -> Optional[int]
+def close()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L133)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L243)
-Get offset that the changelog is up-to-date with.
+Close all registered stores
-**Returns**:
+
-offset or `None` if there's no processed offset yet
+## quixstreams.state.state
-
+
-#### StorePartition.set\_changelog\_offset
+### TransactionState
```python
-def set_changelog_offset(changelog_offset: int)
+class TransactionState(State)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L140)
-
-Set the changelog offset based on a message (usually an "offset-only" message).
-
-Used during recovery.
-
-**Arguments**:
-
-- `changelog_offset`: A changelog offset
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L6)
-
+
-### State
+#### TransactionState.\_\_init\_\_
```python
-class State(Protocol)
+def __init__(prefix: bytes, transaction: PartitionTransaction)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L151)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L12)
-Primary interface for working with key-value state data from `StreamingDataFrame`
+Simple key-value state to be provided into `StreamingDataFrame` functions
-
+**Arguments**:
-#### State.get
+- `prefix`: a key prefix
+- `transaction`: instance of `PartitionTransaction`
+
+
+
+#### TransactionState.get
```python
def get(key: Any, default: Any = None) -> Optional[Any]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L156)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L21)
Get the value for key if key is present in the state, else default
@@ -6145,15 +6365,15 @@ Get the value for key if key is present in the state, else default
value or None if the key is not found and `default` is not provided
-
+
-#### State.set
+#### TransactionState.set
```python
def set(key: Any, value: Any)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L166)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L31)
Set value for the key.
@@ -6162,15 +6382,15 @@ Set value for the key.
- `key`: key
- `value`: value
-
+
-#### State.delete
+#### TransactionState.delete
```python
def delete(key: Any)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L174)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L39)
Delete value for the key.
@@ -6180,15 +6400,15 @@ This function always returns `None`, even if value is not found.
- `key`: key
-
+
-#### State.exists
+#### TransactionState.exists
```python
def exists(key: Any) -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L183)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L48)
Check if the key exists in state.
@@ -6200,743 +6420,813 @@ Check if the key exists in state.
True if key exists, False otherwise
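+
+In practice this key-value interface is what stateful `StreamingDataFrame`
+callbacks receive. A minimal sketch (broker address, topic and the aggregation
+itself are placeholders):
+
+```python
+from quixstreams import Application
+
+app = Application(broker_address="localhost:9092", consumer_group="counter")
+topic = app.topic("input-topic")
+
+def count(value: dict, state) -> dict:
+    # `state` exposes get/set/delete/exists as documented above
+    total = state.get("total", default=0) + 1
+    state.set("total", total)
+    return {**value, "total": total}
+
+sdf = app.dataframe(topic)
+sdf = sdf.apply(count, stateful=True)
+
+if __name__ == "__main__":
+    app.run(sdf)
+```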
-
+
-### PartitionTransaction
+## quixstreams.exceptions
+
+
+
+## quixstreams.exceptions.assignment
+
+
+
+### PartitionAssignmentError
```python
-class PartitionTransaction(State)
+class PartitionAssignmentError(QuixException)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L192)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/exceptions/assignment.py#L6)
-A transaction class to perform simple key-value operations like
-"get", "set", "delete" and "exists" on a single storage partition.
+An error occurred during partition rebalancing.
+Raised from the `on_assign`, `on_revoke` and `on_lost` callbacks.
+
+
+
+## quixstreams.exceptions.base
+
+
+
+## quixstreams.context
-
+
-#### PartitionTransaction.state
+#### set\_message\_context
```python
-@property
-def state() -> State
+def set_message_context(context: Optional[MessageContext])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L199)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L21)
-An instance of State to be provided to `StreamingDataFrame` functions
+Set a MessageContext for the current message in the given `contextvars.Context`
+>***NOTE:*** This is for advanced usage only. If you need to change the message key,
+`StreamingDataFrame.to_topic()` has an argument for it.
-
-#### PartitionTransaction.failed
+Example Snippet:
```python
-@property
-def failed() -> bool
-```
-
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L207)
+from quixstreams import Application, set_message_context, message_context
-Return `True` if transaction failed to update data at some point.
+# Changes the current sdf value based on what the message partition is.
+def alter_context(value):
+ context = message_context()
+ if value > 1:
+ context.headers = context.headers + (b"cool_new_header", value.encode())
+ set_message_context(context)
-Failed transactions cannot be re-used.
+app = Application()
+sdf = app.dataframe()
+sdf = sdf.update(lambda value: alter_context(value))
+```
-**Returns**:
+**Arguments**:
-bool
+- `context`: instance of `MessageContext`
-
+
-#### PartitionTransaction.completed
+#### message\_context
```python
-@property
-def completed() -> bool
+def message_context() -> MessageContext
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L217)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L52)
-Return `True` if transaction is completed.
+Get a MessageContext for the current message, which houses most of the message
-Completed transactions cannot be re-used.
+metadata, like:
+ - key
+ - timestamp
+ - partition
+ - offset
-**Returns**:
-bool
+Example Snippet:
-
+```python
+from quixstreams import Application, message_context
-#### PartitionTransaction.with\_prefix
+# Changes the current sdf value based on what the message partition is.
-```python
-@contextlib.contextmanager
-def with_prefix(prefix: Any = b"") -> Iterator[Self]
+app = Application()
+sdf = app.dataframe()
+sdf = sdf.apply(lambda value: 1 if message_context().partition == 2 else 0)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L227)
-
-A context manager set the prefix for all keys in the scope.
+**Returns**:
-Normally, it's called by `StreamingDataFrame` internals to ensure that every
-message key is stored separately.
+instance of `MessageContext`
-**Arguments**:
+
-- `prefix`: key prefix
+#### message\_key
-**Returns**:
+```python
+def message_key() -> Any
+```
-context manager
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L83)
-
+Get the current message's key.
-#### PartitionTransaction.maybe\_flush
+Example Snippet:
```python
-def maybe_flush(offset: Optional[int] = None)
+from quixstreams import Application, message_key
+
+# Changes the current sdf value based on what the message key is.
+
+app = Application()
+sdf = app.dataframe()
+sdf = sdf.apply(lambda value: 1 if message_key() == b'1' else 0)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L238)
+**Returns**:
-Flush the recent updates and last processed offset to the storage.
+a deserialized message key
-**Arguments**:
+
-- `offset`: offset of the last processed message, optional.
+## quixstreams.kafka
-
+
-### WindowedState
+## quixstreams.kafka.producer
+
+
+
+### Producer
```python
-class WindowedState(Protocol)
+class Producer()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L249)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L37)
-A windowed state to be provided into `StreamingDataFrame` window functions.
-
-
+
-#### WindowedState.get\_window
+#### Producer.\_\_init\_\_
```python
-def get_window(start_ms: int,
- end_ms: int,
- default: Any = None) -> Optional[Any]
+def __init__(broker_address: str,
+ partitioner: Partitioner = "murmur2",
+ extra_config: Optional[dict] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L254)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L38)
-Get the value of the window defined by `start` and `end` timestamps
+A wrapper around `confluent_kafka.Producer`.
-if the window is present in the state, else default
+It initializes `confluent_kafka.Producer` on demand
+avoiding network calls during `__init__`, provides typing info for methods
+and some reasonable defaults.
**Arguments**:
-- `start_ms`: start of the window in milliseconds
-- `end_ms`: end of the window in milliseconds
-- `default`: default value to return if the key is not found
-
-**Returns**:
-
-value or None if the key is not found and `default` is not provided
+- `broker_address`: Kafka broker host and port in format `<host>:<port>`.
+Passed as `bootstrap.servers` to `confluent_kafka.Producer`.
+- `partitioner`: A function to be used to determine the outgoing message
+partition.
+Available values: "random", "consistent_random", "murmur2", "murmur2_random",
+"fnv1a", "fnv1a_random"
+Default - "murmur2".
+- `extra_config`: A dictionary with additional options that
+will be passed to `confluent_kafka.Producer` as is.
+Note: values passed as arguments override values in `extra_config`.
-
+
-#### WindowedState.update\_window
+#### Producer.produce
```python
-def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int)
+def produce(topic: str,
+ value: Optional[Union[str, bytes]] = None,
+ key: Optional[Union[str, bytes]] = None,
+ headers: Optional[Headers] = None,
+ partition: Optional[int] = None,
+ timestamp: Optional[int] = None,
+ poll_timeout: float = 5.0,
+ buffer_error_max_tries: int = 3,
+ on_delivery: Optional[DeliveryCallback] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L268)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L74)
-Set a value for the window.
+Produce a message to a topic.
-This method will also update the latest observed timestamp in state partition
-using the provided `timestamp`.
+It also polls Kafka for callbacks before producing to minimize
+the probability of `BufferError`.
+If `BufferError` still happens, the method will poll Kafka with timeout
+to free up the buffer and try again.
**Arguments**:
-- `start_ms`: start of the window in milliseconds
-- `end_ms`: end of the window in milliseconds
-- `value`: value of the window
-- `timestamp_ms`: current message timestamp in milliseconds
+- `topic`: topic name
+- `value`: message value
+- `key`: message key
+- `headers`: message headers
+- `partition`: topic partition
+- `timestamp`: message timestamp
+- `poll_timeout`: timeout for `poll()` call in case of `BufferError`
+- `buffer_error_max_tries`: max retries for `BufferError`.
+Pass `0` to not retry after `BufferError`.
+- `on_delivery`: the delivery callback to be triggered on `poll()`
+for the produced message.
-
+
-#### WindowedState.get\_latest\_timestamp
+#### Producer.poll
```python
-def get_latest_timestamp() -> int
+def poll(timeout: float = 0)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L282)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L135)
-Get the latest observed timestamp for the current state partition.
-
-Use this timestamp to determine if the arriving event is late and should be
-discarded from the processing.
+Polls the producer for events and calls `on_delivery` callbacks.
-**Returns**:
+**Arguments**:
-latest observed event timestamp in milliseconds
+- `timeout`: poll timeout in seconds; Default: 0 (unlike other methods, which default to None)
+> NOTE: -1 will hang indefinitely if there are no messages to acknowledge
-
+
-#### WindowedState.expire\_windows
+#### Producer.flush
```python
-def expire_windows(duration_ms: int, grace_ms: int = 0)
+def flush(timeout: Optional[float] = None) -> int
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L293)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L143)
-Get a list of expired windows from RocksDB considering the current
+Wait for all messages in the Producer queue to be delivered.
-latest timestamp, window duration and grace period.
+**Arguments**:
-It also marks the latest found window as expired in the expiration index, so
-calling this method multiple times will yield different results for the same
-"latest timestamp".
+- `timeout` (`float`): time to attempt flushing (seconds).
+None or -1 is infinite. Default: None
-**Arguments**:
+**Returns**:
-- `duration_ms`: duration of the windows in milliseconds
-- `grace_ms`: grace period in milliseconds. Default - "0"
+number of messages remaining to flush
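+
+A minimal usage sketch, assuming a local broker and a placeholder topic name:
+
+```python
+from quixstreams.kafka.producer import Producer
+
+def on_delivery(err, msg):
+    # Called from poll()/flush() for every produced message
+    if err is not None:
+        print(f"Delivery failed: {err}")
+
+producer = Producer(broker_address="localhost:9092")
+for i in range(3):
+    producer.produce(
+        topic="output-topic",
+        key=str(i),
+        value=f'{{"n": {i}}}',
+        on_delivery=on_delivery,
+    )
+remaining = producer.flush(timeout=10.0)   # wait for outstanding deliveries
+```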
-
+
-### WindowedPartitionTransaction
+## quixstreams.kafka.consumer
+
+
+
+### Consumer
```python
-class WindowedPartitionTransaction(WindowedState)
+class Consumer()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L308)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L66)
-
+
-#### WindowedPartitionTransaction.failed
+#### Consumer.\_\_init\_\_
```python
-@property
-def failed() -> bool
+def __init__(broker_address: str,
+ consumer_group: Optional[str],
+ auto_offset_reset: AutoOffsetReset,
+ auto_commit_enable: bool = True,
+ assignment_strategy: AssignmentStrategy = "range",
+ on_commit: Optional[Callable[
+ [Optional[KafkaError], List[TopicPartition]], None]] = None,
+ extra_config: Optional[dict] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L313)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L67)
-Return `True` if transaction failed to update data at some point.
+A wrapper around `confluent_kafka.Consumer`.
-Failed transactions cannot be re-used.
+It initializes `confluent_kafka.Consumer` on demand
+avoiding network calls during `__init__`, provides typing info for methods
+and some reasonable defaults.
-**Returns**:
+**Arguments**:
-bool
+- `broker_address`: Kafka broker host and port in format `<host>:<port>`.
+Passed as `bootstrap.servers` to `confluent_kafka.Consumer`.
+- `consumer_group`: Kafka consumer group.
+Passed as `group.id` to `confluent_kafka.Consumer`
+- `auto_offset_reset`: Consumer `auto.offset.reset` setting.
+Available values:
+- "earliest" - automatically reset the offset to the smallest offset
+- "latest" - automatically reset the offset to the largest offset
+- "error" - trigger an error (ERR__AUTO_OFFSET_RESET) which is retrieved
+ by consuming messages (used for testing)
+- `auto_commit_enable`: If true, periodically commit offset of
+the last message handed to the application. Default - `True`.
+- `assignment_strategy`: The name of a partition assignment strategy.
+Available values: "range", "roundrobin", "cooperative-sticky".
+- `on_commit`: Offset commit result propagation callback.
+Passed as "offset_commit_cb" to `confluent_kafka.Consumer`.
+- `extra_config`: A dictionary with additional options that
+will be passed to `confluent_kafka.Consumer` as is.
+Note: values passed as arguments override values in `extra_config`.
-
+
-#### WindowedPartitionTransaction.completed
+#### Consumer.poll
```python
-@property
-def completed() -> bool
+def poll(timeout: Optional[float] = None) -> Optional[Message]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L323)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L126)
-Return `True` if transaction is completed.
+Consumes a single message, calls callbacks and returns events.
-Completed transactions cannot be re-used.
+The application must check the returned `Message`
+object's `error()` method to distinguish between proper
+messages (`error()` returns None) and errors or events.
+
+Note: Callbacks may be called from this method, such as
+``on_assign``, ``on_revoke``, et al.
+
+**Arguments**:
+
+- `timeout` (`float`): Maximum time in seconds to block waiting for message,
+event or callback. None or -1 is infinite. Default: None.
+
+**Raises**:
+
+- `None`: RuntimeError if called on a closed consumer
**Returns**:
-bool
+A Message object or None on timeout
-
+
-#### WindowedPartitionTransaction.with\_prefix
+#### Consumer.subscribe
```python
-def with_prefix(prefix: Any = b"") -> Iterator[Self]
+def subscribe(topics: List[str],
+ on_assign: Optional[RebalancingCallback] = None,
+ on_revoke: Optional[RebalancingCallback] = None,
+ on_lost: Optional[RebalancingCallback] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L332)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L144)
-A context manager set the prefix for all keys in the scope.
+Set subscription to supplied list of topics
-Normally, it's called by `StreamingDataFrame` internals to ensure that every
-message key is stored separately.
+This replaces a previous subscription.
**Arguments**:
-- `prefix`: key prefix
+- `topics` (`list(str)`): List of topics (strings) to subscribe to.
+- `on_assign` (`callable`): callback to provide handling of customized offsets
+on completion of a successful partition re-assignment.
+- `on_revoke` (`callable`): callback to provide handling of offset commits to
+a customized store on the start of a rebalance operation.
+- `on_lost` (`callable`): callback to provide handling in the case the partition
+assignment has been lost. Partitions that have been lost may already be
+owned by other members in the group and therefore committing offsets,
+for example, may fail.
+
+**Raises**:
-**Returns**:
+- `KafkaException`:
+- `None`: RuntimeError if called on a closed consumer
+.. py:function:: on_assign(consumer, partitions)
+.. py:function:: on_revoke(consumer, partitions)
+.. py:function:: on_lost(consumer, partitions)
-context manager
+ :param Consumer consumer: Consumer instance.
+ :param list(TopicPartition) partitions: Absolute list of partitions being
+ assigned or revoked.
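+
+A minimal consume loop as a sketch (broker address, group and topic are
+placeholders; shutdown handling is omitted):
+
+```python
+from quixstreams.kafka.consumer import Consumer
+
+consumer = Consumer(
+    broker_address="localhost:9092",
+    consumer_group="my-group",
+    auto_offset_reset="earliest",
+)
+consumer.subscribe(["input-topic"])
+
+while True:
+    msg = consumer.poll(timeout=1.0)
+    if msg is None:
+        continue                      # no message within the timeout
+    if msg.error() is not None:
+        print(f"Consumer error: {msg.error()}")
+        continue
+    print(msg.key(), msg.value())
+```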
-
+
-#### WindowedPartitionTransaction.maybe\_flush
+#### Consumer.unsubscribe
```python
-def maybe_flush(offset: Optional[int] = None)
+def unsubscribe()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L343)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L238)
-Flush the recent updates and last processed offset to the storage.
+Remove current subscription.
-**Arguments**:
+**Raises**:
-- `offset`: offset of the last processed message, optional.
+- `None`: KafkaException
+- `None`: RuntimeError if called on a closed consumer
-
+
-### PartitionRecoveryTransaction
+#### Consumer.store\_offsets
```python
-class PartitionRecoveryTransaction(Protocol)
+def store_offsets(message: Optional[Message] = None,
+ offsets: Optional[List[TopicPartition]] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L354)
-
-A class for managing recovery for a StorePartition from a changelog message
-
-
-
-#### PartitionRecoveryTransaction.flush
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L246)
-```python
-def flush()
-```
+.. py:function:: store_offsets([message=None], [offsets=None])
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L361)
+Store offsets for a message or a list of offsets.
-Flush the recovery update and last processed offset to the storage.
+``message`` and ``offsets`` are mutually exclusive. The stored offsets
+will be committed according to 'auto.commit.interval.ms' or manual
+offset-less `commit`.
+Note that 'enable.auto.offset.store' must be set to False when using this API.
-
+**Arguments**:
-## quixstreams.utils
+- `message` (`confluent_kafka.Message`): Store message's offset+1.
+- `offsets` (`list(TopicPartition)`): List of topic+partitions+offsets to store.
-
+**Raises**:
-## quixstreams.utils.json
+- `None`: KafkaException
+- `None`: RuntimeError if called on a closed consumer
-
+
-#### dumps
+#### Consumer.commit
```python
-def dumps(value: Any) -> bytes
+def commit(message: Optional[Message] = None,
+ offsets: Optional[List[TopicPartition]] = None,
+ asynchronous: bool = True) -> Optional[List[TopicPartition]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/utils/json.py#L8)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L280)
-Serialize to JSON using `orjson` package.
+Commit a message or a list of offsets.
+
+The ``message`` and ``offsets`` parameters are mutually exclusive.
+If neither is set, the current partition assignment's offsets are used instead.
+Use this method to commit offsets if you have 'enable.auto.commit' set to False.
**Arguments**:
-- `value`: value to serialize to JSON
+- `message` (`confluent_kafka.Message`): Commit the message's offset+1.
+Note: By convention, committed offsets reflect the next message
+to be consumed, **not** the last message consumed.
+- `offsets` (`list(TopicPartition)`): List of topic+partitions+offsets to commit.
+- `asynchronous` (`bool`): If true, asynchronously commit, returning None
+immediately. If False, the commit() call will block until the commit
+succeeds or fails and the committed offsets will be returned (on success).
+Note that specific partitions may have failed and the .err field of
+each partition should be checked for success.
-**Returns**:
+**Raises**:
-bytes
+- `None`: KafkaException
+- `None`: RuntimeError if called on a closed consumer
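+
+For example, a synchronous commit after each processed message might look like
+this sketch (assuming the consumer was created with `auto_commit_enable=False`
+and `process()` is a placeholder for your own logic):
+
+```python
+msg = consumer.poll(timeout=1.0)
+if msg is not None and msg.error() is None:
+    process(msg)
+    committed = consumer.commit(message=msg, asynchronous=False)
+```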
-
+
-#### loads
+#### Consumer.committed
```python
-def loads(value: bytes) -> Any
+def committed(partitions: List[TopicPartition],
+ timeout: Optional[float] = None) -> List[TopicPartition]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/utils/json.py#L18)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L320)
-Deserialize from JSON using `orjson` package.
+.. py:function:: committed(partitions, [timeout=None])
-Main differences:
-- It returns `bytes`
-- It doesn't allow non-str keys in dictionaries
+Retrieve committed offsets for the specified partitions.
**Arguments**:
-- `value`: value to deserialize from
+- `partitions` (`list(TopicPartition)`): List of topic+partitions to query for stored offsets.
+- `timeout` (`float`): Request timeout (seconds).
+None or -1 is infinite. Default: None
-**Returns**:
+**Raises**:
-object
+- `None`: KafkaException
+- `None`: RuntimeError if called on a closed consumer
-
+**Returns**:
-## quixstreams.utils.dicts
+`list(TopicPartition)`: List of topic+partitions with offset and possibly error set.
-
+
-#### dict\_values
+#### Consumer.get\_watermark\_offsets
```python
-def dict_values(d: object) -> List
+def get_watermark_offsets(partition: TopicPartition,
+ timeout: Optional[float] = None,
+ cached: bool = False) -> Tuple[int, int]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/utils/dicts.py#L4)
-
-Recursively unpacks a set of nested dicts to get a flattened list of leaves,
-
-where "leaves" are the first non-dict item.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L340)
-i.e {"a": {"b": {"c": 1}, "d": 2}, "e": 3} becomes [1, 2, 3]
+Retrieve low and high offsets for the specified partition.
**Arguments**:
-- `d`: initially, a dict (with potentially nested dicts)
-
-**Returns**:
-
-a list with all the leaves of the various contained dicts
+- `partition` (`TopicPartition`): Topic+partition to return offsets for.
+- `timeout` (`float`): Request timeout (seconds). None or -1 is infinite.
+Ignored if cached=True. Default: None
+- `cached` (`bool`): Instead of querying the broker, use cached information.
+Cached values: The low offset is updated periodically
+(if statistics.interval.ms is set) while the high offset is updated on each
+message fetched from the broker for this partition.
-
+**Raises**:
-## quixstreams.types
+- `None`: KafkaException
+- `None`: RuntimeError if called on a closed consumer
-
+**Returns**:
-## quixstreams.logging
+`tuple(int,int)`: Tuple of (low,high) on success or None on timeout.
+The high offset is the offset of the last message + 1.
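+
+A small sketch, again assuming a subscribed `consumer` and a hypothetical
+topic/partition, that estimates consumer lag from the high watermark and the
+committed offset:
+
+```python
+from confluent_kafka import TopicPartition
+
+tp = TopicPartition("input-topic", 0)
+watermarks = consumer.get_watermark_offsets(tp, timeout=10.0)
+if watermarks is not None:
+    low, high = watermarks
+    committed = consumer.committed([tp], timeout=10.0)[0]
+    # Fall back to the low watermark if nothing has been committed yet
+    position = committed.offset if committed.offset >= 0 else low
+    print(f"low={low} high={high} lag={high - position}")
+```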
-
+
-#### configure\_logging
+#### Consumer.list\_topics
```python
-def configure_logging(loglevel: Optional[LogLevel]) -> bool
+def list_topics(topic: Optional[str] = None,
+ timeout: Optional[float] = None) -> ClusterMetadata
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/logging.py#L24)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L366)
-Configure "quixstreams" logger.
->***NOTE:*** If "quixstreams" logger already has pre-defined handlers
-(e.g. logging has already been configured via `logging`, or the function
-is called twice), it will skip configuration and return `False`.
+Request metadata from the cluster.
+This method provides the same information as
+listTopics(), describeTopics() and describeCluster() in the Java Admin client.
**Arguments**:
-- `loglevel`: a valid log level as a string or None.
-If None passed, this function is no-op and no logging will be configured.
-
-**Returns**:
-
-True if logging config has been updated, otherwise False.
+- `topic` (`str`): If specified, only request information about this topic,
+else return results for all topics in cluster.
+Warning: If auto.create.topics.enable is set to true on the broker and
+an unknown topic is specified, it will be created.
+- `timeout` (`float`): The maximum response time before timing out.
+None or -1 is infinite. Default: None
-
+**Raises**:
-## quixstreams.context
+- `None`: KafkaException
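+
+For instance, assuming a `consumer` instance as in the earlier sketches, the
+cluster's topics and their partition counts could be listed like this:
+
+```python
+metadata = consumer.list_topics(timeout=10.0)
+for name, topic_meta in metadata.topics.items():
+    print(name, "partitions:", len(topic_meta.partitions))
+```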
-
+
-#### set\_message\_context
+#### Consumer.memberid
```python
-def set_message_context(context: Optional[MessageContext])
+def memberid() -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L21)
-
-Set a MessageContext for the current message in the given `contextvars.Context`
-
->***NOTE:*** This is for advanced usage only. If you need to change the message key,
-`StreamingDataFrame.to_topic()` has an argument for it.
-
-
-Example Snippet:
-
-```python
-from quixstreams import Application, set_message_context, message_context
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L389)
-# Changes the current sdf value based on what the message partition is.
-def alter_context(value):
- context = message_context()
- if value > 1:
- context.headers = context.headers + (b"cool_new_header", value.encode())
- set_message_context(context)
+Return this client's broker-assigned group member id.
-app = Application()
-sdf = app.dataframe()
-sdf = sdf.update(lambda value: alter_context(value))
-```
+The member id is assigned by the group coordinator and is propagated to
+the consumer during rebalance.
-**Arguments**:
+
+**Raises**:
+
+- `None`: RuntimeError if called on a closed consumer
+
+**Returns**:
+
+`string`: Member id string or None
-- `context`: instance of `MessageContext`
-
+
-#### message\_context
+#### Consumer.offsets\_for\_times
```python
-def message_context() -> MessageContext
+def offsets_for_times(partitions: List[TopicPartition],
+ timeout: Optional[float] = None) -> List[TopicPartition]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L52)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L402)
-Get a MessageContext for the current message, which houses most of the message
+Look up offsets by timestamp for the specified partitions.
-metadata, like:
- - key
- - timestamp
- - partition
- - offset
+The returned offset for each partition is the earliest offset whose
+timestamp is greater than or equal to the given timestamp in the
+corresponding partition. If the provided timestamp exceeds that of the
+last message in the partition, a value of -1 will be returned.
+
+**Arguments**:
+
+- `partitions` (`list(TopicPartition)`): topic+partitions with timestamps
+in the TopicPartition.offset field.
+- `timeout` (`float`): The maximum response time before timing out.
+None or -1 is infinite. Default: None
+
+**Raises**:
+
+- `None`: KafkaException
+- `None`: RuntimeError if called on a closed consumer
+
+**Returns**:
+
+`list(TopicPartition)`: List of topic+partition with offset field set and possibly error set
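+
+A brief sketch, assuming a `consumer` instance and a hypothetical topic, that
+finds the first offset at or after a timestamp one hour in the past:
+
+```python
+import time
+
+from confluent_kafka import TopicPartition
+
+# The timestamp to look up goes into the TopicPartition.offset field (milliseconds)
+one_hour_ago_ms = int((time.time() - 3600) * 1000)
+query = [TopicPartition("input-topic", 0, one_hour_ago_ms)]
+for tp in consumer.offsets_for_times(query, timeout=10.0):
+    print(tp.partition, tp.offset)  # offset is -1 if the timestamp is past the last message
+```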
-Example Snippet:
-```python
-from quixstreams import Application, message_context
+
-# Changes the current sdf value based on what the message partition is.
+#### Consumer.pause
-app = Application()
-sdf = app.dataframe()
-sdf = sdf.apply(lambda value: 1 if message_context().partition == 2 else 0)
+```python
+def pause(partitions: List[TopicPartition])
```
-**Returns**:
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L428)
-instance of `MessageContext`
+Pause consumption for the provided list of partitions.
-
+Paused partitions must be tracked manually.
-#### message\_key
+Does NOT affect the result of Consumer.assignment().
-```python
-def message_key() -> Any
-```
+**Arguments**:
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L83)
+- `partitions` (`list(TopicPartition)`): List of topic+partitions to pause.
-Get the current message's key.
+**Raises**:
-Example Snippet:
+- `None`: KafkaException
-```python
-from quixstreams import Application, message_key
+
-# Changes the current sdf value based on what the message key is.
+#### Consumer.resume
-app = Application()
-sdf = app.dataframe()
-sdf = sdf.apply(lambda value: 1 if message_key() == b'1' else 0)
+```python
+def resume(partitions: List[TopicPartition])
```
-**Returns**:
-
-a deserialized message key
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L442)
-
-## quixstreams.rowconsumer
+Resume consumption for the provided list of partitions.
-
+**Arguments**:
-### RowConsumer
+- `partitions` (`list(TopicPartition)`): List of topic+partitions to resume.
-```python
-class RowConsumer(Consumer, RowConsumerProto)
-```
+**Raises**:
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowconsumer.py#L57)
+- `None`: KafkaException
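+
+A rough sketch of a backpressure pattern, assuming a `consumer` with an active
+assignment: pause the assigned partitions, drain any in-flight work, then resume:
+
+```python
+assigned = consumer.assignment()
+consumer.pause(assigned)
+# ... drain buffered/in-flight work; remember which partitions were paused ...
+consumer.resume(assigned)
+```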
-
+
-#### RowConsumer.\_\_init\_\_
+#### Consumer.position
```python
-def __init__(broker_address: str,
- consumer_group: str,
- auto_offset_reset: AutoOffsetReset,
- auto_commit_enable: bool = True,
- assignment_strategy: AssignmentStrategy = "range",
- on_commit: Callable[[Optional[KafkaError], List[TopicPartition]],
- None] = None,
- extra_config: Optional[dict] = None,
- on_error: Optional[ConsumerErrorCallback] = None)
+def position(partitions: List[TopicPartition]) -> List[TopicPartition]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowconsumer.py#L58)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L454)
-A consumer class that is capable of deserializing Kafka messages to Rows
+Retrieve current positions (offsets) for the specified partitions.
-according to the Topics deserialization settings.
+**Arguments**:
-It overrides `.subscribe()` method of Consumer class to accept `Topic`
-objects instead of strings.
+- `partitions` (`list(TopicPartition)`): List of topic+partitions to return
+current offsets for. The current offset is the offset of
+the last consumed message + 1.
-**Arguments**:
+**Raises**:
-- `broker_address`: Kafka broker host and port in format `:`.
-Passed as `bootstrap.servers` to `confluent_kafka.Consumer`.
-- `consumer_group`: Kafka consumer group.
-Passed as `group.id` to `confluent_kafka.Consumer`
-- `auto_offset_reset`: Consumer `auto.offset.reset` setting.
-Available values:
-- "earliest" - automatically reset the offset to the smallest offset
-- "latest" - automatically reset the offset to the largest offset
-- `auto_commit_enable`: If true, periodically commit offset of
-the last message handed to the application. Default - `True`.
-- `assignment_strategy`: The name of a partition assignment strategy.
-Available values: "range", "roundrobin", "cooperative-sticky".
-- `on_commit`: Offset commit result propagation callback.
-Passed as "offset_commit_cb" to `confluent_kafka.Consumer`.
-- `extra_config`: A dictionary with additional options that
-will be passed to `confluent_kafka.Consumer` as is.
-Note: values passed as arguments override values in `extra_config`.
-- `on_error`: a callback triggered when `RowConsumer.poll_row` fails.
-If consumer fails and the callback returns `True`, the exception
-will be logged but not propagated.
-The default callback logs an exception and returns `False`.
+- `None`: KafkaException
+- `None`: RuntimeError if called on a closed consumer
-
+**Returns**:
-#### RowConsumer.subscribe
+`list(TopicPartition)`: List of topic+partitions with offset and possibly error set.
+
+
+
+#### Consumer.seek
```python
-def subscribe(topics: List[Topic],
- on_assign: Optional[RebalancingCallback] = None,
- on_revoke: Optional[RebalancingCallback] = None,
- on_lost: Optional[RebalancingCallback] = None)
+def seek(partition: TopicPartition)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowconsumer.py#L113)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L468)
-Set subscription to supplied list of topics.
+Set consume position for partition to offset.
-This replaces a previous subscription.
+The offset may be an absolute (>=0) or a
+logical offset (`OFFSET_BEGINNING` et al.).
-This method also updates the internal mapping with topics that is used
-to deserialize messages to Rows.
+seek() may only be used to update the consume offset of an
+actively consumed partition (i.e., after `assign()`);
+to set the starting offset of a partition not being consumed,
+pass the offset in an `assign()` call instead.
**Arguments**:
-- `topics`: list of `Topic` instances to subscribe to.
-- `on_assign` (`callable`): callback to provide handling of customized offsets
-on completion of a successful partition re-assignment.
-- `on_revoke` (`callable`): callback to provide handling of offset commits to
-a customized store on the start of a rebalance operation.
-- `on_lost` (`callable`): callback to provide handling in the case the partition
-assignment has been lost. Partitions that have been lost may already be
-owned by other members in the group and therefore committing offsets,
-for example, may fail.
+- `partition` (`TopicPartition`): Topic+partition+offset to seek to.
-
+**Raises**:
-#### RowConsumer.poll\_row
+- `None`: KafkaException
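+
+For example, assuming partition 0 of a hypothetical topic is already assigned and
+actively consumed, it can be rewound to the beginning:
+
+```python
+from confluent_kafka import OFFSET_BEGINNING, TopicPartition
+
+consumer.seek(TopicPartition("input-topic", 0, OFFSET_BEGINNING))
+```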
+
+
+
+#### Consumer.assignment
```python
-def poll_row(timeout: float = None) -> Union[Row, List[Row], None]
+def assignment() -> List[TopicPartition]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowconsumer.py#L147)
-
-Consumes a single message and deserialize it to Row or a list of Rows.
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L485)
-The message is deserialized according to the corresponding Topic.
-If deserializer raises `IgnoreValue` exception, this method will return None.
-If Kafka returns an error, it will be raised as exception.
+Returns the current partition assignment.
-**Arguments**:
+**Raises**:
-- `timeout`: poll timeout seconds
+- `None`: KafkaException
+- `None`: RuntimeError if called on a closed consumer
**Returns**:
-single Row, list of Rows or None
+`list(TopicPartition)`: List of assigned topic+partitions.
+
+
+
+#### Consumer.set\_sasl\_credentials
+
+```python
+def set_sasl_credentials(username: str, password: str)
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L498)
-## quixstreams.rowproducer
+Sets the SASL credentials used for this client.
+These credentials will overwrite the old ones, and will be used the next
+time the client needs to authenticate.
+This method will not disconnect existing broker connections that have been
+established with the old credentials.
+This method is applicable only to SASL PLAIN and SCRAM mechanisms.
-
+
-### RowProducer
+#### Consumer.incremental\_assign
```python
-class RowProducer(Producer, RowProducerProto)
+def incremental_assign(partitions: List[TopicPartition])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowproducer.py#L24)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L510)
-A producer class that is capable of serializing Rows to bytes and send them to Kafka.
-
-The serialization is performed according to the Topic serialization settings.
+Assign new partitions.
- It overrides `.subscribe()` method of Consumer class to accept `Topic`
- objects instead of strings.
-
- :param broker_address: Kafka broker host and port in format `:`.
- Passed as `bootstrap.servers` to `confluent_kafka.Producer`.
- :param partitioner: A function to be used to determine the outgoing message
- partition.
- Available values: "random", "consistent_random", "murmur2", "murmur2_random",
- "fnv1a", "fnv1a_random"
- Default - "murmur2".
- :param extra_config: A dictionary with additional options that
- will be passed to `confluent_kafka.Producer` as is.
- Note: values passed as arguments override values in `extra_config`.
- :param on_error: a callback triggered when `RowProducer.produce_row()`
- or `RowProducer.poll()` fail`.
- If producer fails and the callback returns `True`, the exception
- will be logged but not propagated.
- The default callback logs an exception and returns `False`.
+Can be called outside the `Consumer` `on_assign` callback (multiple times).
+Partitions immediately show up in `Consumer.assignment()`.
+Any additional partitions besides the ones passed during the `Consumer`
+`on_assign` callback will NOT be associated with the consumer group.
-
+
-#### RowProducer.produce\_row
+#### Consumer.incremental\_unassign
```python
-def produce_row(row: Row,
- topic: Topic,
- key: Optional[Any] = None,
- partition: Optional[int] = None,
- timestamp: Optional[int] = None)
+def incremental_unassign(partitions: List[TopicPartition])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowproducer.py#L65)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L522)
-Serialize Row to bytes according to the Topic serialization settings
+Revoke partitions.
-and produce it to Kafka
+Can be called outside an on_revoke callback.
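+
+A short sketch, assuming a `consumer` instance and a hypothetical extra partition,
+of growing and later shrinking the assignment incrementally:
+
+```python
+from confluent_kafka import TopicPartition
+
+extra = [TopicPartition("input-topic", 2)]
+consumer.incremental_assign(extra)
+# ... consume from the extended assignment for a while ...
+consumer.incremental_unassign(extra)
+```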
-If this method fails, it will trigger the provided "on_error" callback.
+
-**Arguments**:
+#### Consumer.close
-- `row`: Row object
-- `topic`: Topic object
-- `key`: message key, optional
-- `partition`: partition number, optional
-- `timestamp`: timestamp in milliseconds, optional
+```python
+def close()
+```
-
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L530)
-#### RowProducer.poll
+Close down and terminate the Kafka Consumer.
-```python
-def poll(timeout: float = None)
-```
+Actions performed:
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowproducer.py#L102)
+- Stops consuming.
+- Commits offsets, unless the consumer property 'enable.auto.commit' is set to False.
+- Leaves the consumer group.
-Polls the producer for events and calls `on_delivery` callbacks.
+Registered callbacks may be called from this method,
+see `poll()` for more info.
-If poll fails, it will trigger the provided "on_error" callback
-**Arguments**:
+
-- `timeout`: timeout in seconds
+## quixstreams.kafka.exceptions
@@ -6950,7 +7240,7 @@ If poll fails, it will trigger the provided "on_error" callback
class Application()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L55)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L59)
The main Application class.
@@ -6996,7 +7286,7 @@ def __init__(broker_address: Optional[str] = None,
quix_sdk_token: Optional[str] = None,
consumer_group: Optional[str] = None,
auto_offset_reset: AutoOffsetReset = "latest",
- auto_commit_enable: bool = True,
+ commit_interval: float = 5.0,
partitioner: Partitioner = "murmur2",
consumer_extra_config: Optional[dict] = None,
producer_extra_config: Optional[dict] = None,
@@ -7015,7 +7305,7 @@ def __init__(broker_address: Optional[str] = None,
topic_manager: Optional[TopicManager] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L93)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L97)
**Arguments**:
@@ -7034,9 +7324,9 @@ Passed as `group.id` to `confluent_kafka.Consumer`.
Linked Environment Variable: `Quix__Consumer__Group`.
Default - "quixstreams-default" (set during init)
>***NOTE:*** Quix Applications will prefix it with the Quix workspace id.
+- `commit_interval`: How often to commit the processed messages in seconds.
+Default - 5.0.
- `auto_offset_reset`: Consumer `auto.offset.reset` setting
-- `auto_commit_enable`: If true, periodically commit offset of
-the last message handed to the application. Default - `True`.
- `partitioner`: A function to be used to determine the outgoing message
partition.
- `consumer_extra_config`: A dictionary with additional options that
@@ -7087,7 +7377,6 @@ instead of the default one.
def Quix(cls,
consumer_group: Optional[str] = None,
auto_offset_reset: AutoOffsetReset = "latest",
- auto_commit_enable: bool = True,
partitioner: Partitioner = "murmur2",
consumer_extra_config: Optional[dict] = None,
producer_extra_config: Optional[dict] = None,
@@ -7106,7 +7395,7 @@ def Quix(cls,
topic_manager: Optional[QuixTopicManager] = None) -> Self
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L296)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L303)
>***NOTE:*** DEPRECATED: use Application with `quix_sdk_token` argument instead.
@@ -7150,8 +7439,6 @@ Linked Environment Variable: `Quix__Consumer__Group`.
Default - "quixstreams-default" (set during init).
>***NOTE:*** Quix Applications will prefix it with the Quix workspace id.
- `auto_offset_reset`: Consumer `auto.offset.reset` setting
-- `auto_commit_enable`: If true, periodically commit offset of
-the last message handed to the application. Default - `True`.
- `partitioner`: A function to be used to determine the outgoing message
partition.
- `consumer_extra_config`: A dictionary with additional options that
@@ -7210,7 +7497,7 @@ def topic(name: str,
timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L436)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L439)
Create a topic definition.
@@ -7281,7 +7568,7 @@ topic = app.topic("input-topic", timestamp_extractor=custom_ts_extractor)
def dataframe(topic: Topic) -> StreamingDataFrame
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L516)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L519)
A simple helper method that generates a `StreamingDataFrame`, which is used
@@ -7320,10 +7607,10 @@ to be used as an input topic.
#### Application.stop
```python
-def stop()
+def stop(fail: bool = False)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L552)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L556)
Stop the internal poll loop and the message processing.
@@ -7333,6 +7620,11 @@ likely through some sort of threading).
To otherwise stop an application, either send a `SIGTERM` to the process
(like Kubernetes does) or perform a typical `KeyboardInterrupt` (`Ctrl+C`).
+**Arguments**:
+
+- `fail`: if True, signals that application is stopped due
+to unhandled exception, and it shouldn't commit the current checkpoint.
+
#### Application.get\_producer
@@ -7341,7 +7633,7 @@ To otherwise stop an application, either send a `SIGTERM` to the process
def get_producer() -> Producer
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L566)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L579)
Create and return a pre-configured Producer instance.
The Producer is initialized with params passed to Application.
@@ -7369,10 +7661,10 @@ with app.get_producer() as producer:
#### Application.get\_consumer
```python
-def get_consumer() -> Consumer
+def get_consumer(auto_commit_enable: bool = True) -> Consumer
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L597)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L610)
Create and return a pre-configured Consumer instance.
The Consumer is initialized with params passed to Application.
@@ -7413,7 +7705,7 @@ with app.get_consumer() as consumer:
def clear_state()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L641)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L654)
Clear the state of the application.
@@ -7425,11 +7717,11 @@ Clear the state of the application.
def run(dataframe: StreamingDataFrame)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L719)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L660)
Start processing data from Kafka using provided `StreamingDataFrame`
-One started, can be safely terminated with a `SIGTERM` signal
+Once started, it can be safely terminated with a `SIGTERM` signal
(like Kubernetes does) or a typical `KeyboardInterrupt` (`Ctrl+C`).
@@ -7453,3 +7745,235 @@ app.run(dataframe=df)
- `dataframe`: instance of `StreamingDataFrame`
+
+
+## quixstreams.rowconsumer
+
+
+
+### RowConsumer
+
+```python
+class RowConsumer(Consumer)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowconsumer.py#L19)
+
+
+
+#### RowConsumer.\_\_init\_\_
+
+```python
+def __init__(broker_address: str,
+ consumer_group: str,
+ auto_offset_reset: AutoOffsetReset,
+ auto_commit_enable: bool = True,
+ assignment_strategy: AssignmentStrategy = "range",
+ on_commit: Callable[[Optional[KafkaError], List[TopicPartition]],
+ None] = None,
+ extra_config: Optional[dict] = None,
+ on_error: Optional[ConsumerErrorCallback] = None)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowconsumer.py#L20)
+
+A consumer class that is capable of deserializing Kafka messages to Rows
+
+according to the Topics deserialization settings.
+
+It overrides `.subscribe()` method of Consumer class to accept `Topic`
+objects instead of strings.
+
+**Arguments**:
+
+- `broker_address`: Kafka broker host and port in format `<host>:<port>`.
+Passed as `bootstrap.servers` to `confluent_kafka.Consumer`.
+- `consumer_group`: Kafka consumer group.
+Passed as `group.id` to `confluent_kafka.Consumer`
+- `auto_offset_reset`: Consumer `auto.offset.reset` setting.
+Available values:
+- "earliest" - automatically reset the offset to the smallest offset
+- "latest" - automatically reset the offset to the largest offset
+- `auto_commit_enable`: If true, periodically commit offset of
+the last message handed to the application. Default - `True`.
+- `assignment_strategy`: The name of a partition assignment strategy.
+Available values: "range", "roundrobin", "cooperative-sticky".
+- `on_commit`: Offset commit result propagation callback.
+Passed as "offset_commit_cb" to `confluent_kafka.Consumer`.
+- `extra_config`: A dictionary with additional options that
+will be passed to `confluent_kafka.Consumer` as is.
+Note: values passed as arguments override values in `extra_config`.
+- `on_error`: a callback triggered when `RowConsumer.poll_row` fails.
+If consumer fails and the callback returns `True`, the exception
+will be logged but not propagated.
+The default callback logs an exception and returns `False`.
+
+
+
+#### RowConsumer.subscribe
+
+```python
+def subscribe(topics: List[Topic],
+ on_assign: Optional[RebalancingCallback] = None,
+ on_revoke: Optional[RebalancingCallback] = None,
+ on_lost: Optional[RebalancingCallback] = None)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowconsumer.py#L75)
+
+Set subscription to supplied list of topics.
+
+This replaces a previous subscription.
+
+This method also updates the internal mapping with topics that is used
+to deserialize messages to Rows.
+
+**Arguments**:
+
+- `topics`: list of `Topic` instances to subscribe to.
+- `on_assign` (`callable`): callback to provide handling of customized offsets
+on completion of a successful partition re-assignment.
+- `on_revoke` (`callable`): callback to provide handling of offset commits to
+a customized store on the start of a rebalance operation.
+- `on_lost` (`callable`): callback to provide handling in the case the partition
+assignment has been lost. Partitions that have been lost may already be
+owned by other members in the group and therefore committing offsets,
+for example, may fail.
+
+
+
+#### RowConsumer.poll\_row
+
+```python
+def poll_row(timeout: float = None) -> Union[Row, List[Row], None]
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowconsumer.py#L109)
+
+Consumes a single message and deserializes it to a Row or a list of Rows.
+
+The message is deserialized according to the corresponding Topic.
+If deserializer raises `IgnoreValue` exception, this method will return None.
+If Kafka returns an error, it will be raised as exception.
+
+**Arguments**:
+
+- `timeout`: poll timeout seconds
+
+**Returns**:
+
+single Row, list of Rows or None
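+
+A minimal sketch of using `RowConsumer` directly (an `Application` normally wires
+one up internally); the broker address, consumer group, and topic name below are
+placeholders:
+
+```python
+from quixstreams import Application
+from quixstreams.rowconsumer import RowConsumer
+
+# The Topic instance (with its deserialization settings) comes from Application.topic()
+app = Application(broker_address="localhost:9092", consumer_group="row-demo")
+topic = app.topic("input-topic")
+
+consumer = RowConsumer(
+    broker_address="localhost:9092",
+    consumer_group="row-demo",
+    auto_offset_reset="latest",
+)
+consumer.subscribe([topic])
+row = consumer.poll_row(timeout=1.0)
+if row is not None:
+    print(row)
+```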
+
+
+
+## quixstreams.checkpointing.checkpoint
+
+
+
+### Checkpoint
+
+```python
+class Checkpoint()
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L20)
+
+Class to keep track of state updates and consumer offsets and to checkpoint these
+updates on schedule.
+
+
+
+#### Checkpoint.expired
+
+```python
+def expired() -> bool
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L45)
+
+Returns `True` if checkpoint deadline has expired.
+
+
+
+#### Checkpoint.empty
+
+```python
+def empty() -> bool
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L51)
+
+Returns `True` if checkpoint doesn't have any offsets stored yet.
+
+
+
+
+#### Checkpoint.store\_offset
+
+```python
+def store_offset(topic: str, partition: int, offset: int)
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L58)
+
+Store the offset of the processed message to the checkpoint.
+
+**Arguments**:
+
+- `topic`: topic name
+- `partition`: partition number
+- `offset`: message offset
+
+
+
+#### Checkpoint.get\_store\_transaction
+
+```python
+def get_store_transaction(
+ topic: str,
+ partition: int,
+ store_name: str = DEFAULT_STATE_STORE_NAME) -> PartitionTransaction
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L78)
+
+Get a PartitionTransaction for the given store, topic and partition.
+
+It will return an already-started transaction if there is one.
+
+**Arguments**:
+
+- `topic`: topic name
+- `partition`: partition number
+- `store_name`: store name
+
+**Returns**:
+
+instance of `PartitionTransaction`
+
+
+
+#### Checkpoint.commit
+
+```python
+def commit()
+```
+
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L101)
+
+Commit the checkpoint.
+
+This method will:
+ 1. Produce the changelogs for each state store
+ 2. Flush the producer to ensure everything is delivered.
+ 3. Commit topic offsets.
+ 4. Flush each state store partition to the disk.
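+
+`Checkpoint` instances are created and driven by the processing machinery, so the
+sketch below only illustrates the expected call pattern; the `process_message`
+helper and the topic/partition values are hypothetical:
+
+```python
+def process_message(checkpoint, topic: str, partition: int, offset: int):
+    # Stateful updates go through a store transaction tied to this checkpoint
+    tx = checkpoint.get_store_transaction(topic=topic, partition=partition)
+    # ... update state via `tx` here ...
+    # Record the processed offset so it is committed with the checkpoint
+    checkpoint.store_offset(topic=topic, partition=partition, offset=offset)
+
+# Later, in the polling loop:
+# if checkpoint.expired() and not checkpoint.empty():
+#     checkpoint.commit()
+```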
+
+
+
+## quixstreams.checkpointing
+
+
+
+## quixstreams.checkpointing.exceptions
+
diff --git a/docs/api-reference/serialization.md b/docs/api-reference/serialization.md
index 5c40047f3..691daf043 100644
--- a/docs/api-reference/serialization.md
+++ b/docs/api-reference/serialization.md
@@ -10,7 +10,7 @@
class QuixDeserializer(JSONDeserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L73)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L73)
Handles Deserialization for any Quix-formatted topic.
@@ -27,7 +27,7 @@ def __init__(column_name: Optional[str] = None,
loads: Callable[[Union[bytes, bytearray]], Any] = default_loads)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L80)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L80)
@@ -49,7 +49,7 @@ Default - :py:func:`quixstreams.utils.json.loads`.
def split_values() -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L100)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L100)
Each Quix message might contain data for multiple Rows.
This property informs the downstream processors about that, so they can
@@ -66,7 +66,7 @@ def deserialize(model_key: str, value: Union[List[Mapping],
Mapping]) -> Iterable[Mapping]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L153)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L153)
Deserialization function for particular data types (Timeseries or EventData).
@@ -91,7 +91,7 @@ Iterable of dicts
class QuixTimeseriesSerializer(QuixSerializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L321)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L321)
Serialize data to JSON formatted according to Quix Timeseries format.
@@ -123,7 +123,7 @@ Output:
class QuixEventsSerializer(QuixSerializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L409)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L409)
Serialize data to JSON formatted according to Quix EventData format.
The input value is expected to be a dictionary with the following keys:
@@ -164,7 +164,7 @@ Output:
class BytesDeserializer(Deserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L44)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L44)
A deserializer to bypass bytes without any changes
@@ -176,7 +176,7 @@ A deserializer to bypass bytes without any changes
class BytesSerializer(Serializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L55)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L55)
A serializer to bypass bytes without any changes
@@ -188,7 +188,7 @@ A serializer to bypass bytes without any changes
class StringDeserializer(Deserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L64)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L64)
@@ -200,7 +200,7 @@ class StringDeserializer(Deserializer)
def __init__(column_name: Optional[str] = None, codec: str = "utf_8")
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L65)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L65)
Deserializes bytes to strings using the specified encoding.
@@ -219,7 +219,7 @@ A wrapper around `confluent_kafka.serialization.StringDeserializer`.
class IntegerDeserializer(Deserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L84)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L84)
Deserializes bytes to integers.
@@ -233,7 +233,7 @@ A wrapper around `confluent_kafka.serialization.IntegerDeserializer`.
class DoubleDeserializer(Deserializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L103)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L103)
Deserializes float to IEEE 754 binary64.
@@ -247,7 +247,7 @@ A wrapper around `confluent_kafka.serialization.DoubleDeserializer`.
class StringSerializer(Serializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L122)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L122)
@@ -259,7 +259,7 @@ class StringSerializer(Serializer)
def __init__(codec: str = "utf_8")
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L123)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L123)
Serializes strings to bytes using the specified encoding.
@@ -277,7 +277,7 @@ Serializes strings to bytes using the specified encoding.
class IntegerSerializer(Serializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L135)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L135)
Serializes integers to bytes
@@ -289,7 +289,7 @@ Serializes integers to bytes
class DoubleSerializer(Serializer)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L148)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L148)
Serializes floats to bytes
diff --git a/docs/api-reference/state.md b/docs/api-reference/state.md
index df6a283de..b9cf51c76 100644
--- a/docs/api-reference/state.md
+++ b/docs/api-reference/state.md
@@ -10,7 +10,7 @@
class State(Protocol)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L151)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L136)
Primary interface for working with key-value state data from `StreamingDataFrame`
@@ -24,7 +24,7 @@ Primary interface for working with key-value state data from `StreamingDataFrame
def get(key: Any, default: Any = None) -> Optional[Any]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L156)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L141)
Get the value for key if key is present in the state, else default
@@ -51,7 +51,7 @@ value or None if the key is not found and `default` is not provided
def set(key: Any, value: Any)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L166)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L151)
Set value for the key.
@@ -72,7 +72,7 @@ Set value for the key.
def delete(key: Any)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L174)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L159)
Delete value for the key.
@@ -94,7 +94,7 @@ This function always returns `None`, even if value is not found.
def exists(key: Any) -> bool
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L183)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L168)
Check if the key exists in state.
@@ -123,7 +123,7 @@ True if key exists, False otherwise
class RocksDBOptions(RocksDBOptionsType)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/options.py#L25)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/options.py#L25)
RocksDB database options.
@@ -148,7 +148,7 @@ Please see `rocksdict.Options` for a complete description of other options.
def to_options() -> rocksdict.Options
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/options.py#L53)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/options.py#L53)
Convert parameters to `rocksdict.Options`
diff --git a/docs/api-reference/topics.md b/docs/api-reference/topics.md
index fa90bd1fa..22c500f84 100644
--- a/docs/api-reference/topics.md
+++ b/docs/api-reference/topics.md
@@ -16,7 +16,7 @@
def convert_topic_list(topics: List[Topic]) -> List[ConfluentTopic]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L23)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L23)
Converts `Topic`s to `ConfluentTopic`s as required for Confluent's
@@ -42,7 +42,7 @@ list of confluent_kafka `ConfluentTopic`s
class TopicAdmin()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L46)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L46)
For performing "admin"-level operations on a Kafka cluster, mostly around topics.
@@ -58,7 +58,7 @@ Primarily used to create and inspect topic configurations.
def __init__(broker_address: str, extra_config: Optional[Mapping] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L53)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L53)
@@ -77,7 +77,7 @@ def __init__(broker_address: str, extra_config: Optional[Mapping] = None)
def list_topics() -> Dict[str, ConfluentTopicMetadata]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L74)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L74)
Get a list of topics and their metadata from a Kafka cluster
@@ -97,7 +97,7 @@ a dict of topic names and their metadata objects
def inspect_topics(topic_names: List[str]) -> Dict[str, Optional[TopicConfig]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L83)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L83)
A simplified way of getting the topic configurations of the provided topics
@@ -127,7 +127,7 @@ def create_topics(topics: List[Topic],
finalize_timeout: int = 60)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L156)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L156)
Create the given list of topics and confirm they are ready.
@@ -155,7 +155,7 @@ fail (it ignores issues for a topic already existing).
class TopicConfig()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L43)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L43)
Represents all kafka-level configuration for a kafka topic.
@@ -169,7 +169,7 @@ Generally used by Topic and any topic creation procedures.
class Topic()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L84)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L84)
A definition of a Kafka topic.
@@ -194,7 +194,7 @@ def __init__(
timestamp_extractor: Optional[TimestampExtractor] = None)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L93)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L93)
@@ -220,7 +220,7 @@ milliseconds from a deserialized message.
def name() -> str
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L122)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L122)
Topic name
@@ -234,7 +234,7 @@ Topic name
def row_serialize(row: Row, key: Optional[Any] = None) -> KafkaMessage
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L132)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L132)
Serialize Row to a Kafka message structure
@@ -262,7 +262,7 @@ def row_deserialize(
message: ConfluentKafkaMessageProto) -> Union[Row, List[Row], None]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L155)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L155)
Deserialize incoming Kafka message to a Row.
@@ -292,7 +292,7 @@ Row, list of Rows or None if the message is ignored.
def affirm_ready_for_create(topics: List[Topic])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L19)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L19)
Validate a list of topics is ready for creation attempt
@@ -310,7 +310,7 @@ Validate a list of topics is ready for creation attempt
class TopicManager()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L29)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L29)
The source of all topic management with quixstreams.
@@ -330,7 +330,7 @@ See methods for details.
def __init__(topic_admin: TopicAdmin, create_timeout: int = 60)
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L48)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L48)
@@ -350,7 +350,7 @@ def __init__(topic_admin: TopicAdmin, create_timeout: int = 60)
def changelog_topics() -> Dict[str, Dict[str, Topic]]
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L71)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L71)
Note: `Topic`s are the changelogs.
@@ -368,7 +368,7 @@ def topic_config(num_partitions: Optional[int] = None,
extra_config: Optional[dict] = None) -> TopicConfig
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L121)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L119)
Convenience method for generating a `TopicConfig` with default settings
@@ -402,7 +402,7 @@ def topic(name: str,
timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L142)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L140)
A convenience method for generating a `Topic`. Will use default config options
@@ -438,7 +438,7 @@ def changelog_topic(topic_name: str, store_name: str,
consumer_group: str) -> Topic
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L191)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L189)
Performs all the logic necessary to generate a changelog topic based on a
@@ -483,7 +483,7 @@ generate changelog topics. To turn off changelogs, init an Application with
def create_topics(topics: List[Topic])
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L262)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L260)
Creates topics via an explicit list of provided `Topics`.
@@ -506,7 +506,7 @@ Exists as a way to manually specify what topics to create; otherwise,
def create_all_topics()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L277)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L275)
A convenience method to create all Topic objects stored on this TopicManager.
@@ -520,7 +520,7 @@ A convenience method to create all Topic objects stored on this TopicManager.
def validate_all_topics()
```
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L283)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L281)
Validates all topics exist and changelogs have correct topic and rep factor.
diff --git a/quixstreams/app.py b/quixstreams/app.py
index 9f608dc98..14b6fdb46 100644
--- a/quixstreams/app.py
+++ b/quixstreams/app.py
@@ -40,6 +40,7 @@
check_state_management_enabled,
QuixTopicManager,
)
+from .processing_context import ProcessingContext
from .rowconsumer import RowConsumer
from .rowproducer import RowProducer
from .state import StateStoreManager
@@ -51,6 +52,9 @@
logger = logging.getLogger(__name__)
MessageProcessedCallback = Callable[[str, int, int], None]
+# Enforce idempotent producing for the internal RowProducer
+_default_producer_extra_config = {"enable.idempotence": True}
+
class Application:
"""
@@ -96,7 +100,7 @@ def __init__(
quix_sdk_token: Optional[str] = None,
consumer_group: Optional[str] = None,
auto_offset_reset: AutoOffsetReset = "latest",
- auto_commit_enable: bool = True,
+ commit_interval: float = 5.0,
partitioner: Partitioner = "murmur2",
consumer_extra_config: Optional[dict] = None,
producer_extra_config: Optional[dict] = None,
@@ -130,9 +134,9 @@ def __init__(
Linked Environment Variable: `Quix__Consumer__Group`.
Default - "quixstreams-default" (set during init)
>***NOTE:*** Quix Applications will prefix it with the Quix workspace id.
+ :param commit_interval: How often to commit the processed messages in seconds.
+ Default - 5.0.
:param auto_offset_reset: Consumer `auto.offset.reset` setting
- :param auto_commit_enable: If true, periodically commit offset of
- the last message handed to the application. Default - `True`.
:param partitioner: A function to be used to determine the outgoing message
partition.
:param consumer_extra_config: A dictionary with additional options that
@@ -176,6 +180,15 @@ def __init__(
> NOTE: It is recommended to just use `quix_sdk_token` instead.
"""
configure_logging(loglevel=loglevel)
+ producer_extra_config = producer_extra_config or {}
+ consumer_extra_config = consumer_extra_config or {}
+
+ # Add default values to the producer config, but allow them to be overwritten
+ # by the provided producer_extra_config dict
+ producer_extra_config = {
+ **_default_producer_extra_config,
+ **producer_extra_config,
+ }
# We can't use os.getenv as defaults (and have testing work nicely)
# since it evaluates getenv when the function is defined.
@@ -213,14 +226,13 @@ def __init__(
)
quix_configs = quix_config_builder.get_confluent_broker_config()
# Check if the state dir points to the mounted PVC while running on Quix
- # TODO: Do we still need this?
check_state_dir(state_dir=state_dir)
broker_address = quix_configs.pop("bootstrap.servers")
# Quix Cloud prefixes consumer group with workspace id
consumer_group = quix_config_builder.prepend_workspace_id(consumer_group)
- consumer_extra_config = {**quix_configs, **(consumer_extra_config or {})}
- producer_extra_config = {**quix_configs, **(producer_extra_config or {})}
+ consumer_extra_config = {**quix_configs, **consumer_extra_config}
+ producer_extra_config = {**quix_configs, **producer_extra_config}
else:
# Only broker address is provided
topic_manager_factory = TopicManager
@@ -230,16 +242,15 @@ def __init__(
self._broker_address = broker_address
self._consumer_group = consumer_group
self._auto_offset_reset = auto_offset_reset
- self._auto_commit_enable = auto_commit_enable
self._partitioner = partitioner
+ self._commit_interval = commit_interval
self._producer_extra_config = producer_extra_config
self._consumer_extra_config = consumer_extra_config
-
self._consumer = RowConsumer(
broker_address=broker_address,
consumer_group=consumer_group,
auto_offset_reset=auto_offset_reset,
- auto_commit_enable=auto_commit_enable,
+ auto_commit_enable=False, # Disable auto commit and manage commits manually
assignment_strategy="cooperative-sticky",
extra_config=consumer_extra_config,
on_error=on_consumer_error,
@@ -253,11 +264,11 @@ def __init__(
self._consumer_poll_timeout = consumer_poll_timeout
self._producer_poll_timeout = producer_poll_timeout
- self._running = False
self._on_processing_error = on_processing_error or default_on_processing_error
self._on_message_processed = on_message_processed
self._auto_create_topics = auto_create_topics
- self._do_recovery_check = False
+ self._running = False
+ self._failed = False
if not topic_manager:
topic_manager = topic_manager_factory(
@@ -267,21 +278,11 @@ def __init__(
)
)
self._topic_manager = topic_manager
-
self._state_manager = StateStoreManager(
group_id=consumer_group,
state_dir=state_dir,
rocksdb_options=rocksdb_options,
- producer=(
- RowProducer(
- broker_address=broker_address,
- partitioner=partitioner,
- extra_config=producer_extra_config,
- on_error=on_producer_error,
- )
- if use_changelog_topics
- else None
- ),
+ producer=self._producer if use_changelog_topics else None,
recovery_manager=(
RecoveryManager(
consumer=self._consumer,
@@ -291,13 +292,18 @@ def __init__(
else None
),
)
+ self._processing_context = ProcessingContext(
+ commit_interval=self._commit_interval,
+ producer=self._producer,
+ consumer=self._consumer,
+ state_manager=self._state_manager,
+ )
@classmethod
def Quix(
cls,
consumer_group: Optional[str] = None,
auto_offset_reset: AutoOffsetReset = "latest",
- auto_commit_enable: bool = True,
partitioner: Partitioner = "murmur2",
consumer_extra_config: Optional[dict] = None,
producer_extra_config: Optional[dict] = None,
@@ -356,8 +362,6 @@ def Quix(
Default - "quixstreams-default" (set during init).
>***NOTE:*** Quix Applications will prefix it with the Quix workspace id.
:param auto_offset_reset: Consumer `auto.offset.reset` setting
- :param auto_commit_enable: If true, periodically commit offset of
- the last message handed to the application. Default - `True`.
:param partitioner: A function to be used to determine the outgoing message
partition.
:param consumer_extra_config: A dictionary with additional options that
@@ -415,7 +419,6 @@ def Quix(
consumer_extra_config=consumer_extra_config,
producer_extra_config=producer_extra_config,
auto_offset_reset=auto_offset_reset,
- auto_commit_enable=auto_commit_enable,
partitioner=partitioner,
on_consumer_error=on_consumer_error,
on_processing_error=on_processing_error,
@@ -545,11 +548,12 @@ def dataframe(
to be used as an input topic.
:return: `StreamingDataFrame` object
"""
- sdf = StreamingDataFrame(topic=topic, state_manager=self._state_manager)
- sdf.producer = self._producer
+ sdf = StreamingDataFrame(
+ topic=topic, processing_context=self._processing_context
+ )
return sdf
- def stop(self):
+ def stop(self, fail: bool = False):
"""
Stop the internal poll loop and the message processing.
@@ -558,8 +562,17 @@ def stop(self):
To otherwise stop an application, either send a `SIGTERM` to the process
(like Kubernetes does) or perform a typical `KeyboardInterrupt` (`Ctrl+C`).
+
+    :param fail: if True, signals that the application is stopped due
+    to an unhandled exception, and it shouldn't commit the current checkpoint.
"""
+
self._running = False
+ if fail:
+ # Update "_failed" only when fail=True to prevent stop(failed=False) from
+ # resetting it
+ self._failed = True
+
if self._state_manager.using_changelogs:
self._state_manager.stop_recovery()
@@ -594,7 +607,7 @@ def get_producer(self) -> Producer:
extra_config=self._producer_extra_config,
)
- def get_consumer(self) -> Consumer:
+ def get_consumer(self, auto_commit_enable: bool = True) -> Consumer:
"""
Create and return a pre-configured Consumer instance.
The Consumer is initialized with params passed to Application.
@@ -633,7 +646,7 @@ def get_consumer(self) -> Consumer:
broker_address=self._broker_address,
consumer_group=self._consumer_group,
auto_offset_reset=self._auto_offset_reset,
- auto_commit_enable=self._auto_commit_enable,
+ auto_commit_enable=auto_commit_enable,
assignment_strategy="cooperative-sticky",
extra_config=self._consumer_extra_config,
)
@@ -644,78 +657,6 @@ def clear_state(self):
"""
self._state_manager.clear_stores()
- def _quix_runtime_init(self):
- """
- Do a runtime setup only applicable to an Application.Quix instance
- - Ensure that "State management" flag is enabled for deployment if the app
- is stateful and is running in Quix Cloud
- """
- # Ensure that state management is enabled if application is stateful
- if self._state_manager.stores:
- check_state_management_enabled()
-
- def _setup_topics(self):
- topics_list = ", ".join(
- f'"{topic.name}"' for topic in self._topic_manager.all_topics
- )
- logger.info(f"Topics required for this application: {topics_list}")
- if self._auto_create_topics:
- self._topic_manager.create_all_topics()
- self._topic_manager.validate_all_topics()
-
- def _process_message(self, dataframe_composed, start_state_transaction):
- # Serve producer callbacks
- self._producer.poll(self._producer_poll_timeout)
- rows = self._consumer.poll_row(timeout=self._consumer_poll_timeout)
-
- if rows is None:
- return
-
- # Deserializer may return multiple rows for a single message
- rows = rows if isinstance(rows, list) else [rows]
- if not rows:
- return
-
- first_row = rows[0]
- topic_name, partition, offset = (
- first_row.topic,
- first_row.partition,
- first_row.offset,
- )
-
- with start_state_transaction(
- topic=topic_name, partition=partition, offset=offset
- ):
- for row in rows:
- context = copy_context()
- context.run(set_message_context, first_row.context)
- try:
- # Execute StreamingDataFrame in a context
- context.run(dataframe_composed, row.value)
- except Filtered:
- # The message was filtered by StreamingDataFrame
- continue
- except Exception as exc:
- # TODO: This callback might be triggered because of Producer
- # errors too because they happen within ".process()"
- to_suppress = self._on_processing_error(exc, row, logger)
- if not to_suppress:
- raise
-
- # Store the message offset after it's successfully processed
- self._consumer.store_offsets(
- offsets=[
- TopicPartition(
- topic=topic_name,
- partition=partition,
- offset=offset + 1,
- )
- ]
- )
-
- if self._on_message_processed is not None:
- self._on_message_processed(topic_name, partition, offset)
-
def run(
self,
dataframe: StreamingDataFrame,
@@ -723,7 +664,7 @@ def run(
"""
Start processing data from Kafka using provided `StreamingDataFrame`
- One started, can be safely terminated with a `SIGTERM` signal
+ Once started, it can be safely terminated with a `SIGTERM` signal
(like Kubernetes does) or a typical `KeyboardInterrupt` (`Ctrl+C`).
@@ -751,7 +692,8 @@ def run(
f"Starting the Application with the config: "
f'broker_address="{self._broker_address}" '
f'consumer_group="{self._consumer_group}" '
- f'auto_offset_reset="{self._auto_offset_reset}"'
+ f'auto_offset_reset="{self._auto_offset_reset}" '
+ f"commit_interval={self._commit_interval}s"
)
if self._is_quix_app:
self._quix_runtime_init()
@@ -759,22 +701,11 @@ def run(
self._setup_topics()
exit_stack = contextlib.ExitStack()
- exit_stack.enter_context(self._producer)
- exit_stack.enter_context(self._consumer)
exit_stack.enter_context(self._state_manager)
-
- exit_stack.callback(
- lambda *_: logger.debug("Closing Kafka consumers & producers")
+ exit_stack.enter_context(self._consumer)
+ exit_stack.push(
+ lambda exc_type, exc_val, exc_tb: self.stop(fail=exc_val is not None)
)
- exit_stack.callback(lambda *_: self.stop())
-
- if self._state_manager.stores:
- # Store manager has stores registered, use real state transactions
- # during processing
- start_state_transaction = self._state_manager.start_store_transaction
- else:
- # Application is stateless, use dummy state transactions
- start_state_transaction = _dummy_state_transaction
with exit_stack:
# Subscribe to topics in Kafka and start polling
@@ -788,16 +719,83 @@ def run(
# Start polling Kafka for messages and callbacks
self._running = True
+ # Initialize the checkpoint
+ self._processing_context.init_checkpoint()
+
dataframe_composed = dataframe.compose()
while self._running:
if self._state_manager.recovery_required:
self._state_manager.do_recovery()
else:
- self._process_message(dataframe_composed, start_state_transaction)
+ self._process_message(dataframe_composed)
+ self._processing_context.commit_checkpoint()
logger.info("Stop processing of StreamingDataFrame")
+ def _quix_runtime_init(self):
+ """
+ Do a runtime setup only applicable to an Application.Quix instance
+ - Ensure that "State management" flag is enabled for deployment if the app
+ is stateful and is running in Quix Cloud
+ """
+ # Ensure that state management is enabled if application is stateful
+ if self._state_manager.stores:
+ check_state_management_enabled()
+
+ def _setup_topics(self):
+ topics_list = ", ".join(
+ f'"{topic.name}"' for topic in self._topic_manager.all_topics
+ )
+ logger.info(f"Topics required for this application: {topics_list}")
+ if self._auto_create_topics:
+ self._topic_manager.create_all_topics()
+ self._topic_manager.validate_all_topics()
+
+ def _process_message(self, dataframe_composed):
+ # Serve producer callbacks
+ self._producer.poll(self._producer_poll_timeout)
+ rows = self._consumer.poll_row(timeout=self._consumer_poll_timeout)
+
+ if rows is None:
+ return
+
+ # Deserializer may return multiple rows for a single message
+ rows = rows if isinstance(rows, list) else [rows]
+ if not rows:
+ return
+
+ first_row = rows[0]
+ topic_name, partition, offset = (
+ first_row.topic,
+ first_row.partition,
+ first_row.offset,
+ )
+
+ for row in rows:
+ context = copy_context()
+ context.run(set_message_context, row.context)
+ try:
+ # Execute StreamingDataFrame in a context
+ context.run(dataframe_composed, row.value)
+ except Filtered:
+ # The message was filtered by StreamingDataFrame
+ continue
+ except Exception as exc:
+ # TODO: This callback might be triggered because of Producer
+ # errors too because they happen within ".process()"
+ to_suppress = self._on_processing_error(exc, row, logger)
+ if not to_suppress:
+ raise
+
+ # Store the message offset after it's successfully processed
+ self._processing_context.store_offset(
+ topic=topic_name, partition=partition, offset=offset
+ )
+
+ if self._on_message_processed is not None:
+ self._on_message_processed(topic_name, partition, offset)
+
def _on_assign(self, _, topic_partitions: List[TopicPartition]):
"""
Assign new topic partitions to consumer and state.
@@ -807,6 +805,11 @@ def _on_assign(self, _, topic_partitions: List[TopicPartition]):
# sometimes "empty" calls happen, probably updating the consumer epoch
if not topic_partitions:
return
+
+ # First commit everything processed so far because assignment can take a while
+ # and fail
+ self._processing_context.commit_checkpoint(force=True)
+
# assigning manually here (instead of allowing it handle it automatically)
# enables pausing them during recovery to work as expected
self._consumer.incremental_assign(topic_partitions)
@@ -814,8 +817,14 @@ def _on_assign(self, _, topic_partitions: List[TopicPartition]):
if self._state_manager.stores:
logger.debug(f"Rebalancing: assigning state store partitions")
for tp in topic_partitions:
+ # Get the latest committed offset for the assigned topic partition
+ tp_committed = self._consumer.committed([tp], timeout=30)[0]
# Assign store partitions
- store_partitions = self._state_manager.on_partition_assign(tp)
+ store_partitions = self._state_manager.on_partition_assign(
+ topic=tp.topic,
+ partition=tp.partition,
+ committed_offset=tp_committed.offset,
+ )
# Check if the latest committed offset >= stored offset
# Otherwise, the re-processed messages might use already updated
@@ -828,26 +837,41 @@ def _on_assign(self, _, topic_partitions: List[TopicPartition]):
if offset is not None
]
min_stored_offset = min(stored_offsets) + 1 if stored_offsets else None
- if min_stored_offset is not None:
- tp_committed = self._consumer.committed([tp], timeout=30)[0]
- if min_stored_offset > tp_committed.offset:
- logger.warning(
- f'Warning: offset "{tp_committed.offset}" '
- f"for topic partition "
- f'"{tp_committed.topic}[{tp_committed.partition}]" '
- f'is behind the stored offset "{min_stored_offset}". '
- f"It may lead to distortions in produced data."
- )
+ if (
+ min_stored_offset is not None
+ and min_stored_offset > tp_committed.offset
+ ):
+ logger.warning(
+ f'Warning: offset "{tp_committed.offset}" '
+ f"for topic partition "
+ f'"{tp_committed.topic}[{tp_committed.partition}]" '
+ f'is behind the stored offset "{min_stored_offset}". '
+ f"It may lead to distortions in produced data."
+ )
def _on_revoke(self, _, topic_partitions: List[TopicPartition]):
"""
Revoke partitions from consumer and state
"""
+ # Commit everything processed so far unless the application is closing
+ # because of an unhandled exception.
+ # In this case, we should drop the checkpoint and let another consumer
+ # pick up from the latest one
+ if self._failed:
+ logger.warning(
+ "Application is stopping due to failure, "
+ "latest checkpoint will not be committed."
+ )
+ else:
+ self._processing_context.commit_checkpoint(force=True)
+
self._consumer.incremental_unassign(topic_partitions)
if self._state_manager.stores:
logger.debug(f"Rebalancing: revoking state store partitions")
for tp in topic_partitions:
- self._state_manager.on_partition_revoke(tp)
+ self._state_manager.on_partition_revoke(
+ topic=tp.topic, partition=tp.partition
+ )
def _on_lost(self, _, topic_partitions: List[TopicPartition]):
"""
@@ -856,7 +880,9 @@ def _on_lost(self, _, topic_partitions: List[TopicPartition]):
if self._state_manager.stores:
logger.debug(f"Rebalancing: dropping lost state store partitions")
for tp in topic_partitions:
- self._state_manager.on_partition_lost(tp)
+ self._state_manager.on_partition_revoke(
+ topic=tp.topic, partition=tp.partition
+ )
def _setup_signal_handlers(self):
signal.signal(signal.SIGINT, self._on_sigint)
@@ -872,10 +898,3 @@ def _on_sigint(self, *_):
def _on_sigterm(self, *_):
logger.debug(f"Received SIGTERM, stopping the processing loop")
self.stop()
-
-
-_nullcontext = contextlib.nullcontext()
-
-
-def _dummy_state_transaction(topic: str, partition: int, offset: int):
- return _nullcontext
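The `app.py` changes above replace per-message offset storage with periodic, checkpoint-based commits controlled by `commit_interval`. A minimal sketch of how an application might use the new parameter, assuming `Application` is importable from the package root and that `localhost:9092` and the topic name are placeholders:

```python
from quixstreams import Application

# Sketch: the checkpoint is committed roughly every `commit_interval` seconds;
# passing 0 (or lower) commits after every processed message.
app = Application(
    broker_address="localhost:9092",
    consumer_group="checkpoint-demo",
    commit_interval=1.0,
)

input_topic = app.topic("input-topic")
sdf = app.dataframe(topic=input_topic)
sdf = sdf.apply(lambda value: value)  # placeholder processing step

app.run(sdf)
```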
diff --git a/quixstreams/checkpointing/__init__.py b/quixstreams/checkpointing/__init__.py
new file mode 100644
index 000000000..92235318b
--- /dev/null
+++ b/quixstreams/checkpointing/__init__.py
@@ -0,0 +1,2 @@
+from .checkpoint import Checkpoint as Checkpoint
+from .exceptions import InvalidStoredOffset as InvalidStoredOffset
diff --git a/quixstreams/checkpointing/checkpoint.py b/quixstreams/checkpointing/checkpoint.py
new file mode 100644
index 000000000..e437e3b17
--- /dev/null
+++ b/quixstreams/checkpointing/checkpoint.py
@@ -0,0 +1,166 @@
+import logging
+import time
+from typing import Dict, Tuple
+
+from confluent_kafka import TopicPartition
+
+from quixstreams.kafka import Consumer
+from quixstreams.rowproducer import RowProducer
+from quixstreams.state import (
+ StateStoreManager,
+ PartitionTransaction,
+ DEFAULT_STATE_STORE_NAME,
+)
+from quixstreams.state.exceptions import StoreTransactionFailed
+from .exceptions import InvalidStoredOffset
+
+logger = logging.getLogger(__name__)
+
+
+class Checkpoint:
+ """
+ Class to keep track of state updates and consumer offsets and to checkpoint these
+ updates on schedule.
+ """
+
+ def __init__(
+ self,
+ commit_interval: float,
+ producer: RowProducer,
+ consumer: Consumer,
+ state_manager: StateStoreManager,
+ ):
+ self._created_at = time.monotonic()
+ # A mapping of <(topic, partition): processed offset>
+ self._tp_offsets: Dict[Tuple[str, int], int] = {}
+
+ # A mapping of <(topic, partition, store_name): PartitionTransaction>
+ self._store_transactions: Dict[Tuple[str, int, str], PartitionTransaction] = {}
+ # Passing zero or lower will flush the checkpoint after each processed message
+ self._commit_interval = max(commit_interval, 0)
+ self._state_manager = state_manager
+ self._consumer = consumer
+ self._producer = producer
+
+ def expired(self) -> bool:
+ """
+ Returns `True` if the checkpoint deadline has expired.
+ """
+ return (time.monotonic() - self._commit_interval) >= self._created_at
+
+ def empty(self) -> bool:
+ """
+ Returns `True` if the checkpoint doesn't have any offsets stored yet.
+ """
+ return not bool(self._tp_offsets)
+
+ def store_offset(self, topic: str, partition: int, offset: int):
+ """
+ Store the offset of the processed message to the checkpoint.
+
+ :param topic: topic name
+ :param partition: partition number
+ :param offset: message offset
+ """
+ stored_offset = self._tp_offsets.get((topic, partition), -1)
+ # A paranoid check to ensure that processed offsets always increase within the
+ # same checkpoint.
+ # It shouldn't normally happen, but a lot of logic relies on it,
+ # and it's better to be safe.
+ if offset < stored_offset:
+ raise InvalidStoredOffset(
+ f"Cannot store offset smaller or equal than already processed"
+ f" one: {offset} <= {stored_offset}"
+ )
+ self._tp_offsets[(topic, partition)] = offset
+
+ def get_store_transaction(
+ self, topic: str, partition: int, store_name: str = DEFAULT_STATE_STORE_NAME
+ ) -> PartitionTransaction:
+ """
+ Get a PartitionTransaction for the given store, topic and partition.
+
+ It will return already started transaction if there's one.
+
+ :param topic: topic name
+ :param partition: partition number
+ :param store_name: store name
+ :return: instance of `PartitionTransaction`
+ """
+ transaction = self._store_transactions.get((topic, partition, store_name))
+ if transaction is not None:
+ return transaction
+
+ store = self._state_manager.get_store(topic=topic, store_name=store_name)
+ transaction = store.start_partition_transaction(partition=partition)
+
+ self._store_transactions[(topic, partition, store_name)] = transaction
+ return transaction
+
+ def commit(self):
+ """
+ Commit the checkpoint.
+
+ This method will:
+ 1. Produce the changelogs for each state store
+ 2. Flush the producer to ensure everything is delivered.
+ 3. Commit topic offsets.
+ 4. Flush each state store partition to the disk.
+ """
+
+ if not self._tp_offsets:
+ # No messages have been processed during this checkpoint, return
+ return
+
+ # Step 1. Produce the changelogs
+ for (
+ topic,
+ partition,
+ store_name,
+ ), transaction in self._store_transactions.items():
+ offset = self._tp_offsets[(topic, partition)]
+ if transaction.failed:
+ raise StoreTransactionFailed(
+ f'Detected a failed transaction for store "{store_name}", '
+ f"the checkpoint is aborted"
+ )
+ transaction.prepare(processed_offset=offset)
+
+ # Step 2. Flush producer to trigger all delivery callbacks and ensure that
+ # all messages are produced
+ self._producer.flush()
+ # Get produced offsets after flushing the producer
+ produced_offsets = self._producer.offsets
+
+ # Step 3. Commit offsets to Kafka
+ offsets = [
+ TopicPartition(topic=topic, partition=partition, offset=offset + 1)
+ for (topic, partition), offset in self._tp_offsets.items()
+ ]
+ self._consumer.commit(offsets=offsets, asynchronous=False)
+
+ # Step 4. Flush state store partitions to the disk together with changelog
+ # offsets
+ for (
+ topic,
+ partition,
+ store_name,
+ ), transaction in self._store_transactions.items():
+ offset = self._tp_offsets[(topic, partition)]
+
+ # Get the changelog topic-partition for the given transaction
+ # It can be None if changelog topics are disabled in the app config
+ changelog_tp = transaction.changelog_topic_partition
+ # The changelog offset also can be None if no updates happened
+ # during transaction
+ changelog_offset = (
+ produced_offsets.get(changelog_tp) if changelog_tp is not None else None
+ )
+ if changelog_offset is not None:
+ # Increment the changelog offset by one to match the high watermark
+ # in Kafka
+ changelog_offset += 1
+ transaction.flush(
+ processed_offset=offset, changelog_offset=changelog_offset
+ )
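The four-step commit implemented by `Checkpoint.commit()` is easiest to see from the caller's side. A hedged sketch of the intended usage, assuming `producer`, `consumer` and `state_manager` are already-configured `RowProducer`, `Consumer` and `StateStoreManager` instances:

```python
# Sketch of the commit cycle implemented by Checkpoint above.
checkpoint = Checkpoint(
    commit_interval=5.0,
    producer=producer,
    consumer=consumer,
    state_manager=state_manager,
)

# For every successfully processed message:
checkpoint.store_offset(topic="input-topic", partition=0, offset=100)

# Periodically, in the processing loop:
if not checkpoint.empty() and checkpoint.expired():
    # Produces changelogs, flushes the producer, commits offsets to Kafka,
    # and flushes state partitions to disk (steps 1-4 in Checkpoint.commit()).
    checkpoint.commit()
```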
diff --git a/quixstreams/checkpointing/exceptions.py b/quixstreams/checkpointing/exceptions.py
new file mode 100644
index 000000000..fc0bff910
--- /dev/null
+++ b/quixstreams/checkpointing/exceptions.py
@@ -0,0 +1,4 @@
+from quixstreams.exceptions import QuixException
+
+
+class InvalidStoredOffset(QuixException): ...
diff --git a/quixstreams/dataframe/dataframe.py b/quixstreams/dataframe/dataframe.py
index 5fdbf2be2..f036cae54 100644
--- a/quixstreams/dataframe/dataframe.py
+++ b/quixstreams/dataframe/dataframe.py
@@ -11,12 +11,11 @@
from quixstreams.context import (
message_context,
set_message_context,
- message_key,
)
from quixstreams.core.stream import StreamCallable, Stream
from quixstreams.models import Topic, Row, MessageContext
-from quixstreams.rowproducer import RowProducerProto
-from quixstreams.state import StateStoreManager, State
+from quixstreams.processing_context import ProcessingContext
+from quixstreams.state import State
from .base import BaseStreaming
from .exceptions import InvalidOperation
from .series import StreamingSeries
@@ -79,13 +78,17 @@ class StreamingDataFrame(BaseStreaming):
def __init__(
self,
topic: Topic,
- state_manager: StateStoreManager,
+ processing_context: ProcessingContext,
stream: Optional[Stream] = None,
):
self._stream: Stream = stream or Stream()
self._topic = topic
- self._real_producer: Optional[RowProducerProto] = None
- self._state_manager = state_manager
+ self._processing_context = processing_context
+ self._producer = processing_context.producer
+
+ @property
+ def processing_context(self) -> ProcessingContext:
+ return self._processing_context
@property
def stream(self) -> Stream:
@@ -95,10 +98,6 @@ def stream(self) -> Stream:
def topic(self) -> Topic:
return self._topic
- @property
- def state_manager(self) -> StateStoreManager:
- return self._state_manager
-
def __bool__(self):
raise InvalidOperation(
f"Cannot assess truth level of a {self.__class__.__name__} "
@@ -144,7 +143,7 @@ def func(d: dict, state: State):
"""
if stateful:
self._register_store()
- func = _as_stateful(func=func, state_manager=self._state_manager)
+ func = _as_stateful(func=func, processing_context=self._processing_context)
stream = self.stream.add_apply(func, expand=expand)
return self._clone(stream=stream)
@@ -183,7 +182,7 @@ def func(values: list, state: State):
"""
if stateful:
self._register_store()
- func = _as_stateful(func=func, state_manager=self._state_manager)
+ func = _as_stateful(func=func, processing_context=self._processing_context)
stream = self.stream.add_update(func)
return self._clone(stream=stream)
@@ -225,21 +224,11 @@ def func(d: dict, state: State):
if stateful:
self._register_store()
- func = _as_stateful(func=func, state_manager=self._state_manager)
+ func = _as_stateful(func=func, processing_context=self._processing_context)
stream = self.stream.add_filter(func)
return self._clone(stream=stream)
- @property
- def producer(self) -> RowProducerProto:
- if self._real_producer is None:
- raise RuntimeError("Producer instance has not been provided")
- return self._real_producer
-
- @producer.setter
- def producer(self, producer: RowProducerProto):
- self._real_producer = producer
-
@staticmethod
def contains(key: str) -> StreamingSeries:
"""
@@ -519,23 +508,25 @@ def hopping_window(
def _clone(self, stream: Stream) -> Self:
clone = self.__class__(
- stream=stream, topic=self._topic, state_manager=self._state_manager
+ stream=stream,
+ topic=self._topic,
+ processing_context=self._processing_context,
)
- if self._real_producer is not None:
- clone.producer = self._real_producer
return clone
def _produce(self, topic: Topic, value: object, key: Optional[object] = None):
ctx = message_context()
key = key or ctx.key
row = Row(value=value, context=ctx) # noqa
- self.producer.produce_row(row, topic, key=key)
+ self._producer.produce_row(row, topic, key=key)
def _register_store(self):
"""
Register the default store for input topic in StateStoreManager
"""
- self._state_manager.register_store(topic_name=self._topic.name)
+ self._processing_context.state_manager.register_store(
+ topic_name=self._topic.name
+ )
def __setitem__(self, key, value: Union[Self, object]):
if isinstance(value, self.__class__):
@@ -579,22 +570,24 @@ def __getitem__(
# Take only certain keys from the dict and return a new dict
return self.apply(lambda v: {k: v[k] for k in item})
elif isinstance(item, str):
- # Create a StreamingSeries based on key
+ # Create a StreamingSeries based on a column name
return StreamingSeries(name=item)
else:
raise TypeError(f'Unsupported key type "{type(item)}"')
def _as_stateful(
- func: DataFrameStatefulFunc, state_manager: StateStoreManager
+ func: DataFrameStatefulFunc, processing_context: ProcessingContext
) -> DataFrameFunc:
@functools.wraps(func)
def wrapper(value: object) -> object:
- transaction = state_manager.get_store_transaction()
- key = message_key()
- # Prefix all the state keys by the message key
- with transaction.with_prefix(prefix=key):
- # Pass a State object with an interface limited to the key updates only
- return func(value, transaction.state)
+ ctx = message_context()
+ transaction = processing_context.checkpoint.get_store_transaction(
+ topic=ctx.topic, partition=ctx.partition
+ )
+ # Pass a State object with an interface limited to the key updates only
+ # and prefix all the state keys by the message key
+ state = transaction.as_state(prefix=ctx.key)
+ return func(value, state)
return wrapper
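With `_as_stateful` now resolving the store transaction from the current checkpoint and prefixing state keys with the message key via `transaction.as_state(prefix=ctx.key)`, user callbacks keep the same shape as before. A small sketch of a stateful callback, assuming `sdf` is an existing `StreamingDataFrame` and that `State.get`/`State.set` behave as in the existing state API:

```python
from quixstreams.state import State

# Sketch: a user-defined stateful callback as wired by `_as_stateful` above.
def count_events(value: dict, state: State) -> dict:
    total = state.get("total", 0) + 1
    state.set("total", total)
    return {**value, "total": total}

# The state is keyed by the incoming message key (the transaction prefix),
# so each key accumulates its own "total".
sdf = sdf.apply(count_events, stateful=True)
```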
diff --git a/quixstreams/dataframe/windows/time_based.py b/quixstreams/dataframe/windows/time_based.py
index 609483f4c..1e0f02c63 100644
--- a/quixstreams/dataframe/windows/time_based.py
+++ b/quixstreams/dataframe/windows/time_based.py
@@ -1,16 +1,10 @@
-import logging
import functools
+import logging
from typing import Any, Optional, List, TYPE_CHECKING, cast, Tuple
-from quixstreams.context import (
- message_context,
- message_key,
-)
-from quixstreams.state import (
- StateStoreManager,
- WindowedPartitionTransaction,
- WindowedState,
-)
+from quixstreams.context import message_context
+from quixstreams.processing_context import ProcessingContext
+from quixstreams.state import WindowedPartitionTransaction, WindowedState
from .base import (
WindowedDataFrameFunc,
WindowAggregateFunc,
@@ -161,7 +155,7 @@ def current(self, expand: bool = True) -> "StreamingDataFrame":
)
def register_store(self):
- self._dataframe.state_manager.register_windowed_store(
+ self._dataframe.processing_context.state_manager.register_windowed_store(
topic_name=self._dataframe.topic.name, store_name=self._name
)
@@ -177,7 +171,9 @@ def _apply_window(
self.register_store()
func = _as_windowed(
- func=func, state_manager=self._dataframe.state_manager, store_name=name
+ func=func,
+ processing_context=self._dataframe.processing_context,
+ store_name=name,
)
return self._dataframe.apply(func=func, expand=expand)
@@ -194,23 +190,25 @@ def _noop() -> Any:
def _as_windowed(
- func: WindowedDataFrameFunc, state_manager: StateStoreManager, store_name: str
+ func: WindowedDataFrameFunc, processing_context: ProcessingContext, store_name: str
) -> "DataFrameFunc":
@functools.wraps(func)
def wrapper(value: object) -> object:
+ ctx = message_context()
+ key = ctx.key
transaction = cast(
WindowedPartitionTransaction,
- state_manager.get_store_transaction(store_name=store_name),
+ processing_context.checkpoint.get_store_transaction(
+ topic=ctx.topic, partition=ctx.partition, store_name=store_name
+ ),
)
- key = message_key()
if key is None:
- ctx = message_context()
logger.warning(
f"Skipping window processing for a message because the key is None, "
f"partition='{ctx.topic}[{ctx.partition}]' offset='{ctx.offset}'."
)
return _noop()
- with transaction.with_prefix(prefix=key):
- return func(value, transaction.state)
+ state = transaction.as_state(prefix=key)
+ return func(value, state)
return wrapper
diff --git a/quixstreams/kafka/exceptions.py b/quixstreams/kafka/exceptions.py
new file mode 100644
index 000000000..ba9c71ce7
--- /dev/null
+++ b/quixstreams/kafka/exceptions.py
@@ -0,0 +1,32 @@
+from confluent_kafka import KafkaError
+
+from quixstreams.exceptions import QuixException
+
+
+class BaseKafkaException(QuixException):
+ def __init__(self, error: KafkaError):
+ self.error = error
+
+ @property
+ def code(self) -> int:
+ return self.error.code()
+
+ @property
+ def description(self):
+ return self.error.str()
+
+ def __str__(self):
+ return (
+ f"<{self.__class__.__name__} "
+ f'code="{self.code}" '
+ f'description="{self.description}">'
+ )
+
+ def __repr__(self):
+ return str(self)
+
+
+class KafkaConsumerException(BaseKafkaException): ...
+
+
+class KafkaProducerDeliveryError(BaseKafkaException): ...
diff --git a/quixstreams/kafka/producer.py b/quixstreams/kafka/producer.py
index 841065b1f..63be26aeb 100644
--- a/quixstreams/kafka/producer.py
+++ b/quixstreams/kafka/producer.py
@@ -1,13 +1,14 @@
import logging
-from typing import Union, Optional
-from typing_extensions import Literal
-from quixstreams.models.types import Headers
+from typing import Union, Optional, Callable
from confluent_kafka import (
Producer as ConfluentProducer,
KafkaError,
Message,
)
+from typing_extensions import Literal
+
+from quixstreams.models.types import Headers
__all__ = (
"Producer",
@@ -18,6 +19,8 @@
"random", "consistent_random", "murmur2", "murmur2_random", "fnv1a", "fnv1a_random"
]
+DeliveryCallback = Callable[[Optional[KafkaError], Message], None]
+
logger = logging.getLogger(__name__)
@@ -31,26 +34,6 @@ def _default_error_cb(error: KafkaError):
)
-def _on_delivery_cb(err: Optional[KafkaError], msg: Message):
- if err is not None:
- logger.debug(
- 'Delivery failed: topic="%s" partition="%s" key="%s" error=%s ' "code=%s",
- msg.topic(),
- msg.partition(),
- msg.key(),
- err.str(),
- err.code(),
- )
- else:
- logger.debug(
- 'Delivery succeeded: topic="%s" partition="%s" key="%s" value="%s"',
- msg.topic(),
- msg.partition(),
- msg.key(),
- msg.value(),
- )
-
-
class Producer:
def __init__(
self,
@@ -87,9 +70,6 @@ def __init__(
)
self._producer_config = config
self._inner_producer: Optional[ConfluentProducer] = None
- # Optimization: pass `on_delivery` callbacks only in "debug" mode, otherwise
- # it significantly reduces a throughput because of additional function calls
- self._enable_delivery_callbacks = logger.isEnabledFor(logging.DEBUG)
def produce(
self,
@@ -101,10 +81,12 @@ def produce(
timestamp: Optional[int] = None,
poll_timeout: float = 5.0,
buffer_error_max_tries: int = 3,
+ on_delivery: Optional[DeliveryCallback] = None,
):
"""
- Produce message to topic.
- It also polls Kafka for callbacks before producing in order to minimize
+ Produce a message to a topic.
+
+ It also polls Kafka for callbacks before producing to minimize
the probability of `BufferError`.
If `BufferError` still happens, the method will poll Kafka with timeout
to free up the buffer and try again.
@@ -118,6 +100,8 @@ def produce(
:param poll_timeout: timeout for `poll()` call in case of `BufferError`
:param buffer_error_max_tries: max retries for `BufferError`.
Pass `0` to not retry after `BufferError`.
+ :param on_delivery: the delivery callback to be triggered on `poll()`
+ for the produced message.
"""
@@ -125,9 +109,8 @@ def produce(
"partition": partition,
"timestamp": timestamp,
"headers": headers,
+ "on_delivery": on_delivery,
}
- if self._enable_delivery_callbacks:
- kwargs["on_delivery"] = _on_delivery_cb
# confluent_kafka doesn't like None for optional parameters
kwargs = {k: v for k, v in kwargs.items() if v is not None}
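Instead of the removed global debug-only callback, `produce()` now accepts a per-message `on_delivery` callback. A sketch of passing one, assuming `producer` is a configured `Producer` instance and `output-topic` is a placeholder:

```python
from typing import Optional
from confluent_kafka import KafkaError, Message

# Sketch: a per-message delivery callback, triggered on a later poll()/flush().
def on_delivery(err: Optional[KafkaError], msg: Message) -> None:
    if err is not None:
        print(f"delivery failed for {msg.topic()}[{msg.partition()}]: {err}")
    else:
        print(f"delivered to {msg.topic()}[{msg.partition()}] @ {msg.offset()}")

producer.produce(
    topic="output-topic",
    key=b"key",
    value=b"value",
    on_delivery=on_delivery,
)
```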
diff --git a/quixstreams/processing_context.py b/quixstreams/processing_context.py
new file mode 100644
index 000000000..91b3dcc2a
--- /dev/null
+++ b/quixstreams/processing_context.py
@@ -0,0 +1,80 @@
+import dataclasses
+import logging
+import time
+from typing import Optional
+
+from quixstreams.checkpointing import Checkpoint
+from quixstreams.exceptions import QuixException
+from quixstreams.rowconsumer import RowConsumer
+from quixstreams.rowproducer import RowProducer
+from quixstreams.state import StateStoreManager
+
+__all__ = ("ProcessingContext",)
+
+logger = logging.getLogger(__name__)
+
+
+class CheckpointNotInitialized(QuixException): ...
+
+
+@dataclasses.dataclass
+class ProcessingContext:
+ """
+ A class to share processing-related objects
+ between `Application` and `StreamingDataFrame` instances.
+ """
+
+ commit_interval: float
+ producer: RowProducer
+ consumer: RowConsumer
+ state_manager: StateStoreManager
+ _checkpoint: Optional[Checkpoint] = dataclasses.field(
+ init=False, repr=False, default=None
+ )
+
+ @property
+ def checkpoint(self) -> Checkpoint:
+ if self._checkpoint is None:
+ raise CheckpointNotInitialized("Checkpoint has not been initialized yet")
+ return self._checkpoint
+
+ def store_offset(self, topic: str, partition: int, offset: int):
+ """
+ Store the offset of the processed message to the checkpoint.
+
+ :param topic: topic name
+ :param partition: partition number
+ :param offset: message offset
+ """
+ self._checkpoint.store_offset(topic=topic, partition=partition, offset=offset)
+
+ def init_checkpoint(self):
+ """
+ Initialize a new checkpoint
+ """
+ self._checkpoint = Checkpoint(
+ commit_interval=self.commit_interval,
+ state_manager=self.state_manager,
+ producer=self.producer,
+ consumer=self.consumer,
+ )
+
+ def commit_checkpoint(self, force: bool = False):
+ """
+ Commit the current checkpoint.
+
+ The actual commit will happen only when:
+
+ 1. The checkpoint has at least one stored offset
+ 2. The checkpoint is expired or `force=True` is passed
+
+ :param force: if `True`, commit the checkpoint before its expiration deadline.
+ """
+ if not self._checkpoint.empty() and (self._checkpoint.expired() or force):
+
+ logger.info(f"Committing a checkpoint force={force}")
+ start = time.monotonic()
+ self._checkpoint.commit()
+ elapsed = round(time.monotonic() - start, 2)
+ logger.info(f"Committed a checkpoint force={force} time_elapsed={elapsed}s")
+ self.init_checkpoint()
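`ProcessingContext` is the piece that `Application` and `StreamingDataFrame` now share. A sketch of the lifecycle it implements, with the same assumptions about pre-built `producer`, `consumer` and `state_manager` objects as in the earlier sketches:

```python
# Sketch of how the Application drives a ProcessingContext.
processing_context = ProcessingContext(
    commit_interval=5.0,
    producer=producer,
    consumer=consumer,
    state_manager=state_manager,
)
processing_context.init_checkpoint()

# After each processed message:
processing_context.store_offset(topic="input-topic", partition=0, offset=100)

# In the main loop: commits only when the checkpoint is non-empty and expired...
processing_context.commit_checkpoint()
# ...or immediately when forced (e.g. during rebalancing):
processing_context.commit_checkpoint(force=True)
```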
diff --git a/quixstreams/rowconsumer.py b/quixstreams/rowconsumer.py
index e56a2013f..21a99fe03 100644
--- a/quixstreams/rowconsumer.py
+++ b/quixstreams/rowconsumer.py
@@ -2,59 +2,21 @@
from typing import Optional, Callable, List, Union, Mapping
from confluent_kafka import KafkaError, TopicPartition
-from typing_extensions import Protocol
from .error_callbacks import ConsumerErrorCallback, default_on_consumer_error
-from .exceptions import QuixException, PartitionAssignmentError
+from .exceptions import PartitionAssignmentError
from .kafka import Consumer, AssignmentStrategy, AutoOffsetReset
from .kafka.consumer import RebalancingCallback
+from .kafka.exceptions import KafkaConsumerException
from .models import Topic, Row
from .models.serializers.exceptions import IgnoreMessage
logger = logging.getLogger(__name__)
+__all__ = ("RowConsumer",)
-class KafkaMessageError(QuixException):
- def __init__(self, error: KafkaError):
- self.error = error
- @property
- def code(self) -> int:
- return self.error.code()
-
- @property
- def description(self):
- return self.error.str()
-
- def __str__(self):
- return (
- f"<{self.__class__.__name__} "
- f'code="{self.code}" '
- f'description="{self.description}">'
- )
-
- def __repr__(self):
- return str(self)
-
-
-class RowConsumerProto(Protocol):
- def commit(
- self,
- message=None,
- offsets: List[TopicPartition] = None,
- asynchronous: bool = True,
- ) -> Optional[List[TopicPartition]]: ...
-
- def subscribe(
- self,
- topics: List[Topic],
- on_assign: Optional[RebalancingCallback] = None,
- on_revoke: Optional[RebalancingCallback] = None,
- on_lost: Optional[RebalancingCallback] = None,
- ): ...
-
-
-class RowConsumer(Consumer, RowConsumerProto):
+class RowConsumer(Consumer):
def __init__(
self,
broker_address: str,
@@ -172,7 +134,7 @@ def poll_row(self, timeout: float = None) -> Union[Row, List[Row], None]:
topic_name, partition, offset = msg.topic(), msg.partition(), msg.offset()
try:
if msg.error():
- raise KafkaMessageError(error=msg.error())
+ raise KafkaConsumerException(error=msg.error())
topic = self._topics[topic_name]
diff --git a/quixstreams/rowproducer.py b/quixstreams/rowproducer.py
index ab6845ed6..76322ac69 100644
--- a/quixstreams/rowproducer.py
+++ b/quixstreams/rowproducer.py
@@ -1,49 +1,36 @@
import logging
-from typing import Optional, Any
+from typing import Optional, Any, Union, Dict, Tuple
-from typing_extensions import Protocol
+from confluent_kafka import KafkaError, Message
from .error_callbacks import ProducerErrorCallback, default_on_producer_error
+from .kafka.exceptions import KafkaProducerDeliveryError
from .kafka.producer import Producer, Partitioner
-from .models import Topic, Row
+from .models import Topic, Row, Headers
logger = logging.getLogger(__name__)
-class RowProducerProto(Protocol):
- def produce_row(
- self,
- row: Row,
- topic: Topic,
- key: Optional[Any] = None,
- partition: Optional[int] = None,
- timestamp: Optional[int] = None,
- ): ...
-
-
-class RowProducer(Producer, RowProducerProto):
+class RowProducer:
"""
A producer class that is capable of serializing Rows to bytes and sending them to Kafka.
The serialization is performed according to the Topic serialization settings.
- It overrides `.subscribe()` method of Consumer class to accept `Topic`
- objects instead of strings.
-
- :param broker_address: Kafka broker host and port in format `:`.
- Passed as `bootstrap.servers` to `confluent_kafka.Producer`.
- :param partitioner: A function to be used to determine the outgoing message
- partition.
- Available values: "random", "consistent_random", "murmur2", "murmur2_random",
- "fnv1a", "fnv1a_random"
- Default - "murmur2".
- :param extra_config: A dictionary with additional options that
- will be passed to `confluent_kafka.Producer` as is.
- Note: values passed as arguments override values in `extra_config`.
- :param on_error: a callback triggered when `RowProducer.produce_row()`
- or `RowProducer.poll()` fail`.
- If producer fails and the callback returns `True`, the exception
- will be logged but not propagated.
- The default callback logs an exception and returns `False`.
+ :param broker_address: Kafka broker host and port in format `<host>:<port>`.
+ Passed as `bootstrap.servers` to `confluent_kafka.Producer`.
+ :param partitioner: A function to be used to determine the outgoing message
+ partition.
+ Available values: "random", "consistent_random", "murmur2", "murmur2_random",
+ "fnv1a", "fnv1a_random"
+ Default - "murmur2".
+ :param extra_config: A dictionary with additional options that
+ will be passed to `confluent_kafka.Producer` as is.
+ Note: values passed as arguments override values in `extra_config`.
+ :param on_error: a callback triggered when `RowProducer.produce_row()`
+ or `RowProducer.poll()` fail.
+ If producer fails and the callback returns `True`, the exception
+ will be logged but not propagated.
+ The default callback logs an exception and returns `False`.
"""
def __init__(
@@ -53,14 +40,17 @@ def __init__(
extra_config: dict = None,
on_error: Optional[ProducerErrorCallback] = None,
):
- super().__init__(
+ self._producer = Producer(
broker_address=broker_address,
partitioner=partitioner,
extra_config=extra_config,
)
+
self._on_error: Optional[ProducerErrorCallback] = (
on_error or default_on_producer_error
)
+ self._tp_offsets: Dict[Tuple[str, int], int] = {}
+ self._error: Optional[KafkaError] = None
def produce_row(
self,
@@ -103,14 +93,71 @@ def poll(self, timeout: float = None):
"""
Polls the producer for events and calls `on_delivery` callbacks.
- If poll fails, it will trigger the provided "on_error" callback
+ If `poll()` fails, it will trigger the provided "on_error" callback
:param timeout: timeout in seconds
"""
try:
- super().poll(timeout=timeout)
+ self._producer.poll(timeout=timeout)
except Exception as exc:
to_suppress = self._on_error(exc, None, logger)
if to_suppress:
return
raise
+
+ def produce(
+ self,
+ topic: str,
+ value: Optional[Union[str, bytes]] = None,
+ key: Optional[Union[str, bytes]] = None,
+ headers: Optional[Headers] = None,
+ partition: Optional[int] = None,
+ timestamp: Optional[int] = None,
+ poll_timeout: float = 5.0,
+ buffer_error_max_tries: int = 3,
+ ):
+ self._raise_for_error()
+
+ return self._producer.produce(
+ topic=topic,
+ value=value,
+ key=key,
+ headers=headers,
+ partition=partition,
+ timestamp=timestamp,
+ poll_timeout=poll_timeout,
+ buffer_error_max_tries=buffer_error_max_tries,
+ on_delivery=self._on_delivery,
+ )
+
+ def _on_delivery(self, err: Optional[KafkaError], msg: Message):
+ if self._error is not None:
+ # There's an error already set
+ return
+
+ topic, partition, offset = msg.topic(), msg.partition(), msg.offset()
+ if err is None:
+ self._tp_offsets[(topic, partition)] = offset
+ else:
+ self._error = err
+
+ def _raise_for_error(self):
+ if self._error is not None:
+ exc = KafkaProducerDeliveryError(self._error)
+ self._error = None
+ raise exc
+
+ def flush(self, timeout: Optional[float] = None) -> int:
+ result = self._producer.flush(timeout=timeout)
+ self._raise_for_error()
+ return result
+
+ @property
+ def offsets(self) -> Dict[Tuple[str, int], int]:
+ return self._tp_offsets
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.flush()
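`RowProducer` now wraps `Producer` by composition, records the delivered offset per topic partition in its own delivery callback, and re-raises any recorded delivery error as `KafkaProducerDeliveryError`. A sketch of how a caller such as the checkpoint code can rely on this, assuming `row_producer` is a configured `RowProducer` and the topic name is a placeholder:

```python
# Sketch: produce, flush, then read back the delivered offsets.
row_producer.produce(topic="changelog-topic", key=b"k", value=b"v", partition=0)

# flush() waits for outstanding deliveries and re-raises any delivery error
# recorded by the internal callback as KafkaProducerDeliveryError.
row_producer.flush()

# After a successful flush, the last delivered offset per (topic, partition)
# is available for checkpointing:
offset = row_producer.offsets.get(("changelog-topic", 0))
```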
diff --git a/quixstreams/state/exceptions.py b/quixstreams/state/exceptions.py
index ee1ba062b..7ec216213 100644
--- a/quixstreams/state/exceptions.py
+++ b/quixstreams/state/exceptions.py
@@ -14,3 +14,6 @@ class WindowedStoreAlreadyRegisteredError(QuixException): ...
class InvalidStoreTransactionStateError(QuixException): ...
+
+
+class StoreTransactionFailed(QuixException): ...
diff --git a/quixstreams/state/manager.py b/quixstreams/state/manager.py
index d83190a8e..a2a1d51ec 100644
--- a/quixstreams/state/manager.py
+++ b/quixstreams/state/manager.py
@@ -1,31 +1,24 @@
-import contextlib
import logging
import shutil
from pathlib import Path
-from typing import List, Dict, Optional, Iterator
+from typing import List, Dict, Optional
from quixstreams.rowproducer import RowProducer
-from quixstreams.types import TopicPartition
from .exceptions import (
StoreNotRegisteredError,
- InvalidStoreTransactionStateError,
PartitionStoreIsUsed,
WindowedStoreAlreadyRegisteredError,
)
from .recovery import RecoveryManager, ChangelogProducerFactory
from .rocksdb import RocksDBStore, RocksDBOptionsType
from .rocksdb.windowed.store import WindowedRocksDBStore
-from .types import (
- Store,
- PartitionTransaction,
- StorePartition,
-)
+from .types import Store, StorePartition
-__all__ = ("StateStoreManager",)
+__all__ = ("StateStoreManager", "DEFAULT_STATE_STORE_NAME")
logger = logging.getLogger(__name__)
-_DEFAULT_STATE_STORE_NAME = "default"
+DEFAULT_STATE_STORE_NAME = "default"
class StateStoreManager:
@@ -52,7 +45,6 @@ def __init__(
self._stores: Dict[str, Dict[str, Store]] = {}
self._producer = producer
self._recovery_manager = recovery_manager
- self._transaction: Optional[_MultiStoreTransaction] = None
def _init_state_dir(self):
logger.info(f'Initializing state directory at "{self._state_dir}"')
@@ -106,7 +98,7 @@ def stop_recovery(self):
return self._recovery_manager.stop_recovery()
def get_store(
- self, topic: str, store_name: str = _DEFAULT_STATE_STORE_NAME
+ self, topic: str, store_name: str = DEFAULT_STATE_STORE_NAME
) -> Store:
"""
Get a store for given name and topic
@@ -129,17 +121,18 @@ def _setup_changelogs(
f'State Manager: registering changelog for store "{store_name}" '
f'(topic "{topic_name}")'
)
+ changelog_topic = self._recovery_manager.register_changelog(
+ topic_name=topic_name,
+ store_name=store_name,
+ consumer_group=self._group_id,
+ )
return ChangelogProducerFactory(
- self._recovery_manager.register_changelog(
- topic_name=topic_name,
- store_name=store_name,
- consumer_group=self._group_id,
- ).name,
- self._producer,
+ changelog_name=changelog_topic.name,
+ producer=self._producer,
)
def register_store(
- self, topic_name: str, store_name: str = _DEFAULT_STATE_STORE_NAME
+ self, topic_name: str, store_name: str = DEFAULT_STATE_STORE_NAME
):
"""
Register a state store to be managed by StateStoreManager.
@@ -201,45 +194,44 @@ def clear_stores(self):
shutil.rmtree(self._state_dir)
- def on_partition_assign(self, tp: TopicPartition) -> List[StorePartition]:
+ def on_partition_assign(
+ self, topic: str, partition: int, committed_offset: int
+ ) -> List[StorePartition]:
"""
Assign store partitions for each registered store for the given `TopicPartition`
and return a list of assigned `StorePartition` objects.
- :param tp: `TopicPartition` from Kafka consumer
+ :param topic: Kafka topic name
+ :param partition: Kafka topic partition
+ :param committed_offset: latest committed offset for the partition
:return: list of assigned `StorePartition`
"""
store_partitions = {}
- for name, store in self._stores.get(tp.topic, {}).items():
- store_partition = store.assign_partition(tp.partition)
+ for name, store in self._stores.get(topic, {}).items():
+ store_partition = store.assign_partition(partition)
store_partitions[name] = store_partition
if self._recovery_manager and store_partitions:
self._recovery_manager.assign_partition(
- tp.topic, tp.partition, store_partitions
+ topic=topic,
+ partition=partition,
+ committed_offset=committed_offset,
+ store_partitions=store_partitions,
)
return list(store_partitions.values())
- def on_partition_revoke(self, tp: TopicPartition):
+ def on_partition_revoke(self, topic: str, partition: int):
"""
Revoke store partitions for each registered store for the given `TopicPartition`
- :param tp: `TopicPartition` from Kafka consumer
+ :param topic: Kafka topic name
+ :param partition: Kafka topic partition
"""
- if stores := self._stores.get(tp.topic, {}).values():
+ if stores := self._stores.get(topic, {}).values():
if self._recovery_manager:
- self._recovery_manager.revoke_partition(tp.partition)
+ self._recovery_manager.revoke_partition(partition_num=partition)
for store in stores:
- store.revoke_partition(tp.partition)
-
- def on_partition_lost(self, tp: TopicPartition):
- """
- Revoke and close store partitions for each registered store for the given
- `TopicPartition`
-
- :param tp: `TopicPartition` from Kafka consumer
- """
- self.on_partition_revoke(tp)
+ store.revoke_partition(partition=partition)
def init(self):
"""
@@ -256,123 +248,9 @@ def close(self):
for store in topic_stores.values():
store.close()
- def get_store_transaction(
- self, store_name: str = _DEFAULT_STATE_STORE_NAME
- ) -> PartitionTransaction:
- """
- Get active `PartitionTransaction` for the store
- :param store_name:
- :return:
- """
- if self._transaction is None:
- raise InvalidStoreTransactionStateError(
- "Store transaction is not started yet"
- )
- return self._transaction.get_store_transaction(store_name=store_name)
-
- @contextlib.contextmanager
- def start_store_transaction(
- self, topic: str, partition: int, offset: int
- ) -> Iterator["_MultiStoreTransaction"]:
- """
- Starting the multi-store transaction for the Kafka message.
-
- This transaction will keep track of all used stores and flush them in the end.
- If any exception is caught during this transaction, none of them
- will be flushed as a best effort to keep stores consistent in "at-least-once" setting.
-
- There can be only one active transaction at a time. Starting a new transaction
- before the end of the current one will fail.
-
-
- :param topic: message topic
- :param partition: message partition
- :param offset: message offset
- """
- if not self._stores.get(topic):
- raise StoreNotRegisteredError(
- f'Topic "{topic}" does not have stores registered'
- )
-
- if self._transaction is not None:
- raise InvalidStoreTransactionStateError(
- "Another transaction is already in progress"
- )
- self._transaction = _MultiStoreTransaction(
- manager=self, topic=topic, partition=partition, offset=offset
- )
- try:
- yield self._transaction
- self._transaction.flush()
- finally:
- self._transaction = None
-
def __enter__(self):
self.init()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
-
-
-class _MultiStoreTransaction:
- """
- A transaction-like class to manage flushing of multiple state partitions for each
- processed message.
-
- It is responsible for:
- - Keeping track of actual DBTransactions for the individual stores
- - Flushing of the opened transactions in the end
-
- """
-
- def __init__(
- self, manager: "StateStoreManager", topic: str, partition: int, offset: int
- ):
- self._manager = manager
- self._transactions: Dict[str, PartitionTransaction] = {}
- self._topic = topic
- self._partition = partition
- self._offset = offset
-
- def get_store_transaction(
- self, store_name: str = _DEFAULT_STATE_STORE_NAME
- ) -> PartitionTransaction:
- """
- Get a PartitionTransaction for the given store
-
- It will return already started transaction if there's one.
-
- :param store_name: store name
- :return: instance of `PartitionTransaction`
- """
- transaction = self._transactions.get(store_name)
- if transaction is not None:
- return transaction
-
- store = self._manager.get_store(topic=self._topic, store_name=store_name)
- transaction = store.start_partition_transaction(partition=self._partition)
- self._transactions[store_name] = transaction
- return transaction
-
- def flush(self):
- """
- Flush all `PartitionTransaction` instances for each registered store and
- save the last processed offset for each partition.
-
- Empty transactions without any updates will not be flushed.
-
- If there are any failed transactions, no transactions will be flushed
- to keep the stores consistent.
- """
- for store_name, transaction in self._transactions.items():
- if transaction.failed:
- logger.warning(
- f'Detected failed transaction for store "{store_name}" '
- f'(topic "{self._topic}" partition "{self._partition}" '
- f'offset "{self._offset}), state transactions will not be flushed"'
- )
- return
-
- for transaction in self._transactions.values():
- transaction.maybe_flush(offset=self._offset)
diff --git a/quixstreams/state/recovery.py b/quixstreams/state/recovery.py
index daeb6301c..c26ef6236 100644
--- a/quixstreams/state/recovery.py
+++ b/quixstreams/state/recovery.py
@@ -4,7 +4,7 @@
from confluent_kafka import TopicPartition as ConfluentPartition
from quixstreams.kafka import Consumer
-from quixstreams.models import ConfluentKafkaMessageProto
+from quixstreams.models import ConfluentKafkaMessageProto, Topic
from quixstreams.models.topics import TopicManager
from quixstreams.models.types import MessageHeadersMapping
from quixstreams.rowproducer import RowProducer
@@ -13,8 +13,12 @@
logger = logging.getLogger(__name__)
-
-__all__ = ("ChangelogProducer", "ChangelogProducerFactory", "RecoveryManager")
+__all__ = (
+ "ChangelogProducer",
+ "ChangelogProducerFactory",
+ "RecoveryManager",
+ "RecoveryPartition",
+)
class RecoveryPartition:
@@ -30,12 +34,30 @@ def __init__(
changelog_name: str,
partition_num: int,
store_partition: StorePartition,
+ committed_offset: int,
):
- self.changelog_name = changelog_name
- self.partition_num = partition_num
- self.store_partition = store_partition
+ self._changelog_name = changelog_name
+ self._partition_num = partition_num
+ self._store_partition = store_partition
self._changelog_lowwater: Optional[int] = None
self._changelog_highwater: Optional[int] = None
+ self._committed_offset = committed_offset
+
+ @property
+ def changelog_name(self) -> str:
+ return self._changelog_name
+
+ @property
+ def partition_num(self) -> int:
+ return self._partition_num
+
+ @property
+ def changelog_highwater(self) -> Optional[int]:
+ return self._changelog_highwater
+
+ @property
+ def changelog_lowwater(self) -> Optional[int]:
+ return self._changelog_lowwater
@property
def offset(self) -> int:
@@ -44,7 +66,7 @@ def offset(self) -> int:
:return: changelog offset (int)
"""
- return self.store_partition.get_changelog_offset() or 0
+ return self._store_partition.get_changelog_offset() or 0
@property
def needs_recovery(self):
@@ -52,7 +74,7 @@ def needs_recovery(self):
Determine whether recovery is necessary for underlying `StorePartition`.
"""
has_consumable_offsets = self._changelog_lowwater != self._changelog_highwater
- state_is_behind = (self._changelog_highwater - self.offset) > 0
+ state_is_behind = self._changelog_highwater > self.offset
return has_consumable_offsets and state_is_behind
@property
@@ -62,7 +84,7 @@ def needs_offset_update(self):
Usually checked during assign if recovery was not required.
"""
- return self._changelog_highwater and (self.offset != self._changelog_highwater)
+ return self._changelog_highwater and (self._changelog_highwater < self.offset)
def update_offset(self):
"""
@@ -80,7 +102,7 @@ def update_offset(self):
f"network issues. State may be inaccurate for any affected keys. "
f"The offset will now be set to {self._changelog_highwater}."
)
- self.store_partition.set_changelog_offset(
+ self._store_partition.set_changelog_offset(
changelog_offset=self._changelog_highwater - 1
)
@@ -92,8 +114,8 @@ def recover_from_changelog_message(
:param changelog_message: A confluent kafka message (everything as bytes)
"""
- self.store_partition.recover_from_changelog_message(
- changelog_message=changelog_message
+ self._store_partition.recover_from_changelog_message(
+ changelog_message=changelog_message, committed_offset=self._committed_offset
)
def set_watermarks(self, lowwater: int, highwater: int):
@@ -122,7 +144,7 @@ def __init__(self, changelog_name: str, producer: RowProducer):
self._changelog_name = changelog_name
self._producer = producer
- def get_partition_producer(self, partition_num):
+ def get_partition_producer(self, partition_num) -> "ChangelogProducer":
"""
Generate a ChangelogProducer for producing to a specific partition number
(and thus StorePartition).
@@ -130,7 +152,9 @@ def get_partition_producer(self, partition_num):
:param partition_num: source topic partition number
"""
return ChangelogProducer(
- self._changelog_name, partition_num, producer=self._producer
+ changelog_name=self._changelog_name,
+ partition=partition_num,
+ producer=self._producer,
)
@@ -140,16 +164,29 @@ class ChangelogProducer:
kafka changelog partition.
"""
- def __init__(self, changelog_name: str, partition_num: int, producer: RowProducer):
+ def __init__(
+ self,
+ changelog_name: str,
+ partition: int,
+ producer: RowProducer,
+ ):
"""
:param changelog_name: A changelog topic name
- :param partition_num: source topic partition number
+ :param partition: source topic partition number
:param producer: a RowProducer (not shared with `Application` instance)
"""
self._changelog_name = changelog_name
- self._partition_num = partition_num
+ self._partition = partition
self._producer = producer
+ @property
+ def changelog_name(self) -> str:
+ return self._changelog_name
+
+ @property
+ def partition(self) -> int:
+ return self._partition
+
def produce(
self,
key: bytes,
@@ -167,12 +204,12 @@ def produce(
key=key,
value=value,
headers=headers,
- partition=self._partition_num,
+ partition=self._partition,
topic=self._changelog_name,
)
- def flush(self):
- self._producer.flush()
+ def flush(self, timeout: Optional[float] = None) -> int:
+ return self._producer.flush(timeout=timeout)
class RecoveryManager:
@@ -193,6 +230,14 @@ def __init__(self, consumer: Consumer, topic_manager: TopicManager):
self._topic_manager = topic_manager
self._recovery_partitions: Dict[int, Dict[str, RecoveryPartition]] = {}
+ @property
+ def partitions(self) -> Dict[int, Dict[str, RecoveryPartition]]:
+ """
+ Returns a mapping of assigned RecoveryPartitions in the following format:
+ {<partition>: {<changelog_name>: <RecoveryPartition>}}
+ """
+ return self._recovery_partitions
+
@property
def has_assignments(self) -> bool:
"""
@@ -211,7 +256,9 @@ def recovering(self) -> bool:
"""
return self.has_assignments and self._running
- def register_changelog(self, topic_name: str, store_name: str, consumer_group: str):
+ def register_changelog(
+ self, topic_name: str, store_name: str, consumer_group: str
+ ) -> Topic:
"""
Register a changelog Topic with the TopicManager.
@@ -252,29 +299,37 @@ def _generate_recovery_partitions(
topic_name: str,
partition_num: int,
store_partitions: Dict[str, StorePartition],
+ committed_offset: int,
) -> List[RecoveryPartition]:
- recovery_partitions = [
- RecoveryPartition(
- changelog_name=self._topic_manager.changelog_topics[topic_name][
- store_name
- ].name,
+ partitions = []
+ for store_name, store_partition in store_partitions.items():
+ changelog_topic = self._topic_manager.changelog_topics[topic_name][
+ store_name
+ ]
+ recovery_partition = RecoveryPartition(
+ changelog_name=changelog_topic.name,
partition_num=partition_num,
store_partition=store_partition,
+ committed_offset=committed_offset,
)
- for store_name, store_partition in store_partitions.items()
- ]
- for rp in recovery_partitions:
- rp.set_watermarks(
- *self._consumer.get_watermark_offsets(
- ConfluentPartition(rp.changelog_name, rp.partition_num), timeout=10
- )
+
+ lowwater, highwater = self._consumer.get_watermark_offsets(
+ ConfluentPartition(
+ topic=recovery_partition.changelog_name,
+ partition=recovery_partition.partition_num,
+ ),
+ timeout=10,
)
- return recovery_partitions
+ recovery_partition.set_watermarks(lowwater=lowwater, highwater=highwater)
+
+ partitions.append(recovery_partition)
+ return partitions
def assign_partition(
self,
- topic_name: str,
- partition_num: int,
+ topic: str,
+ partition: int,
+ committed_offset: int,
store_partitions: Dict[str, StorePartition],
):
"""
@@ -283,32 +338,36 @@ def assign_partition(
Pauses active consumer partitions as needed.
"""
recovery_partitions = self._generate_recovery_partitions(
- topic_name=topic_name,
- partition_num=partition_num,
+ topic_name=topic,
+ partition_num=partition,
store_partitions=store_partitions,
+ committed_offset=committed_offset,
)
for rp in recovery_partitions:
- c_name, p_num = rp.changelog_name, rp.partition_num
+ changelog_name, partition = rp.changelog_name, rp.partition_num
if rp.needs_recovery:
- logger.info(f"Recovery required for {c_name}[{p_num}]")
- self._recovery_partitions.setdefault(p_num, {})[c_name] = rp
+ logger.info(f"Recovery required for {changelog_name}[{partition}]")
+ self._recovery_partitions.setdefault(partition, {})[changelog_name] = rp
self._consumer.incremental_assign(
- [ConfluentPartition(c_name, p_num, rp.offset)]
+ [ConfluentPartition(changelog_name, partition, rp.offset)]
)
elif rp.needs_offset_update:
# nothing to recover, but offset is off...likely that offset >
# highwater due to At Least Once processing behavior.
rp.update_offset()
- # figure out if any pausing is required
- if self.recovering:
- # was already recovering, so pause source topic only
- self._consumer.pause([ConfluentPartition(topic_name, partition_num)])
- logger.info("Continuing recovery...")
- elif self.has_assignments:
- # pause ALL partitions while we wait for Application to start recovery
- # (all newly assigned partitions are available on `.assignment`).
- self._consumer.pause(self._consumer.assignment())
+ # Figure out if we need to pause any topic partitions
+ if self._recovery_partitions:
+ if self._running:
+ # Some partitions are already recovering,
+ # pausing only the source topic partition
+ self._consumer.pause(
+ [ConfluentPartition(topic=topic, partition=partition)]
+ )
+ else:
+ # Recovery hasn't started yet, so pause ALL partitions
+ # and wait for Application to start recovery
+ self._consumer.pause(self._consumer.assignment())
def _revoke_recovery_partitions(
self,
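
The recovery assignment above derives each partition's status from the changelog watermarks fetched via the consumer and the offset persisted in the local store. Below is a minimal, self-contained sketch of that decision with hypothetical names; the exact conditions live in `RecoveryPartition` and may differ:

```python
from dataclasses import dataclass


@dataclass
class PartitionProgress:
    stored_offset: int  # changelog offset persisted in the local state store
    lowwater: int       # changelog topic low watermark
    highwater: int      # changelog topic high watermark

    @property
    def needs_recovery(self) -> bool:
        # There are changelog messages the store has not applied yet
        return self.lowwater != self.highwater and self.stored_offset < self.highwater

    @property
    def needs_offset_update(self) -> bool:
        # Stored offset is ahead of the highwater (possible with at-least-once
        # processing); nothing to replay, only the offset needs clamping
        return self.stored_offset > self.highwater


progress = PartitionProgress(stored_offset=42, lowwater=0, highwater=100)
assert progress.needs_recovery and not progress.needs_offset_update
```
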
diff --git a/quixstreams/state/rocksdb/exceptions.py b/quixstreams/state/rocksdb/exceptions.py
index 502a96336..008f44b6c 100644
--- a/quixstreams/state/rocksdb/exceptions.py
+++ b/quixstreams/state/rocksdb/exceptions.py
@@ -3,10 +3,10 @@
__all__ = (
"StateSerializationError",
"StateTransactionError",
- "NestedPrefixError",
"ColumnFamilyDoesNotExist",
"ColumnFamilyAlreadyExists",
"ColumnFamilyHeaderMissing",
+ "InvalidChangelogOffset",
)
@@ -19,9 +19,6 @@ class StateSerializationError(StateError): ...
class StateTransactionError(StateError): ...
-class NestedPrefixError(StateError): ...
-
-
class ColumnFamilyDoesNotExist(StateError): ...
@@ -29,3 +26,6 @@ class ColumnFamilyAlreadyExists(StateError): ...
class ColumnFamilyHeaderMissing(StateError): ...
+
+
+class InvalidChangelogOffset(StateError): ...
diff --git a/quixstreams/state/rocksdb/metadata.py b/quixstreams/state/rocksdb/metadata.py
index f7683902b..948469f0d 100644
--- a/quixstreams/state/rocksdb/metadata.py
+++ b/quixstreams/state/rocksdb/metadata.py
@@ -6,3 +6,4 @@
METADATA_CF_NAME = "__metadata__"
CHANGELOG_CF_MESSAGE_HEADER = "__column_family__"
+CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER = "__processed_tp_offset__"
diff --git a/quixstreams/state/rocksdb/partition.py b/quixstreams/state/rocksdb/partition.py
index 800a48a1f..546500c78 100644
--- a/quixstreams/state/rocksdb/partition.py
+++ b/quixstreams/state/rocksdb/partition.py
@@ -5,8 +5,8 @@
from rocksdict import WriteBatch, Rdict, ColumnFamily, AccessType
from quixstreams.models import ConfluentKafkaMessageProto
+from quixstreams.utils.json import loads as json_loads
from quixstreams.state.recovery import ChangelogProducer
-from quixstreams.models.types import MessageHeadersMapping
from quixstreams.state.types import (
StorePartition,
)
@@ -20,6 +20,7 @@
PROCESSED_OFFSET_KEY,
CHANGELOG_OFFSET_KEY,
CHANGELOG_CF_MESSAGE_HEADER,
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER,
)
from .options import RocksDBOptions
from .serialization import (
@@ -33,7 +34,6 @@
__all__ = ("RocksDBStorePartition",)
-
logger = logging.getLogger(__name__)
@@ -46,7 +46,6 @@ class RocksDBStorePartition(StorePartition):
1. Managing access to the RocksDB instance
2. Creating transactions to interact with data
3. Flushing WriteBatches to the RocksDB
- 4. Producing state-related changelog messages
It opens the RocksDB on `__init__`. If the db is locked by another process,
it will retry according to `open_max_retries` and `open_retry_backoff` options.
@@ -73,10 +72,6 @@ def __init__(
self._cf_handle_cache: Dict[str, ColumnFamily] = {}
self._changelog_producer = changelog_producer
- @property
- def using_changelogs(self) -> bool:
- return bool(self._changelog_producer)
-
def begin(
self,
) -> RocksDBPartitionTransaction:
@@ -90,6 +85,7 @@ def begin(
partition=self,
dumps=self._dumps,
loads=self._loads,
+ changelog_producer=self._changelog_producer,
)
def _changelog_recover_flush(self, changelog_offset: int, batch: WriteBatch):
@@ -103,28 +99,71 @@ def _changelog_recover_flush(self, changelog_offset: int, batch: WriteBatch):
)
self.write(batch)
+ def _should_apply_changelog(
+ self, headers: Dict[str, bytes], committed_offset: int
+ ) -> bool:
+ """
+ Determine whether the changelog update should be applied or skipped.
+
+ :param headers: changelog message headers
+ :param committed_offset: latest committed offset of the source topic partition
+ :return: True if update should be applied, else False.
+ """
+ # Parse the processed topic-partition-offset info from the changelog message
+ # headers to determine whether the update should be applied or skipped.
+ # It can be empty if the message was produced by the older version of the lib.
+ processed_offset_header = headers.get(
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, b"null"
+ )
+ processed_offset = json_loads(processed_offset_header)
+ if processed_offset is not None:
+ # Skip recovering from the message if its processed offset is ahead of the
+ # current committed offset.
+ # This way it will recover to a consistent state if the checkpointing code
+ # produced the changelog messages but failed to commit
+ # the source topic offset.
+ return processed_offset < committed_offset
+ return True
+
def recover_from_changelog_message(
- self, changelog_message: ConfluentKafkaMessageProto
+ self, changelog_message: ConfluentKafkaMessageProto, committed_offset: int
):
"""
Updates state from a given changelog message.
+ The actual update may be skipped when both conditions are met:
+
+ - The changelog message has headers with the processed message offset.
+ - This processed offset is larger than the latest committed offset for the same
+ topic partition.
+
+ This way the state does not apply changes for not-yet-committed messages,
+ which improves the state consistency guarantees.
+
:param changelog_message: A raw Confluent message read from a changelog topic.
+ :param committed_offset: latest committed offset for the partition
"""
- try:
- cf_handle = self.get_column_family_handle(
- changelog_message.headers()[0][1].decode()
- )
- except IndexError:
+ headers = dict(changelog_message.headers() or ())
+ # Parse the column family name from message headers
+ cf_name = headers.get(CHANGELOG_CF_MESSAGE_HEADER, b"").decode()
+ if not cf_name:
raise ColumnFamilyHeaderMissing(
- f"Header '{CHANGELOG_CF_MESSAGE_HEADER}' missing from changelog message!"
+ f"Header '{CHANGELOG_CF_MESSAGE_HEADER}' missing from changelog message"
)
+ cf_handle = self.get_column_family_handle(cf_name)
+
batch = WriteBatch(raw_mode=True)
- key = changelog_message.key()
- if value := changelog_message.value():
- batch.put(key, value, cf_handle)
- else:
- batch.delete(key, cf_handle)
+ # Determine whether the update should be applied or skipped based on the
+ # latest committed offset and processed offset from the changelog message header
+ if self._should_apply_changelog(
+ headers=headers, committed_offset=committed_offset
+ ):
+ key = changelog_message.key()
+ if value := changelog_message.value():
+ batch.put(key, value, cf_handle)
+ else:
+ batch.delete(key, cf_handle)
+
self._changelog_recover_flush(changelog_message.offset(), batch)
def set_changelog_offset(self, changelog_offset: int):
@@ -137,17 +176,6 @@ def set_changelog_offset(self, changelog_offset: int):
"""
self._changelog_recover_flush(changelog_offset, WriteBatch(raw_mode=True))
- def produce_to_changelog(
- self,
- key: bytes,
- value: Optional[bytes] = None,
- headers: Optional[MessageHeadersMapping] = None,
- ):
- """
- Produce a message to the StorePartitions respective changelog.
- """
- self._changelog_producer.produce(key=key, value=value, headers=headers)
-
def write(self, batch: WriteBatch):
"""
Write `WriteBatch` to RocksDB
@@ -212,8 +240,6 @@ def close(self):
self._cf_handle_cache = {}
self._cf_cache = {}
self._db.close()
- if self._changelog_producer:
- self._changelog_producer.flush()
logger.debug(f'Closed rocksdb partition on "{self._path}"')
@property
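
For context, here is a standalone sketch of the skip rule that `_should_apply_changelog()` above implements: a changelog record is applied only when the processed-offset header it carries (if any) is behind the committed offset of the source topic partition. Only the header name is taken from this diff; the rest is illustrative:

```python
import json
from typing import Dict

CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER = "__processed_tp_offset__"


def should_apply_changelog(headers: Dict[str, bytes], committed_offset: int) -> bool:
    # The header may be absent for messages produced by older library versions
    processed_offset = json.loads(
        headers.get(CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, b"null")
    )
    if processed_offset is None:
        return True
    # Skip records whose source message was not committed yet
    return processed_offset < committed_offset


assert should_apply_changelog({}, committed_offset=10) is True
assert should_apply_changelog(
    {CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: b"15"}, committed_offset=10
) is False
```
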
diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py
index 52207f012..12c99753e 100644
--- a/quixstreams/state/rocksdb/transaction.py
+++ b/quixstreams/state/rocksdb/transaction.py
@@ -1,69 +1,59 @@
-import contextlib
import functools
import logging
-from typing import Any, Union, Optional, Dict, NewType, TYPE_CHECKING
-
-from rocksdict import WriteBatch, ColumnFamily
-from typing_extensions import Self
+from typing import Any, Union, Optional, Dict, NewType, TYPE_CHECKING, Tuple
+from rocksdict import WriteBatch
+from quixstreams.utils.json import dumps as json_dumps
+from quixstreams.state.recovery import ChangelogProducer
from quixstreams.state.types import (
DumpsFunc,
LoadsFunc,
PartitionTransaction,
+ PartitionTransactionStatus,
)
-from .exceptions import (
- NestedPrefixError,
- StateTransactionError,
-)
+from .exceptions import StateTransactionError, InvalidChangelogOffset
from .metadata import (
METADATA_CF_NAME,
PROCESSED_OFFSET_KEY,
CHANGELOG_OFFSET_KEY,
PREFIX_SEPARATOR,
CHANGELOG_CF_MESSAGE_HEADER,
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER,
)
-from .serialization import (
- serialize,
- deserialize,
- int_to_int64_bytes,
-)
+from .serialization import serialize, deserialize, int_to_int64_bytes
from ..state import TransactionState
if TYPE_CHECKING:
from .partition import RocksDBStorePartition
-logger = logging.getLogger(__name__)
+__all__ = ("RocksDBPartitionTransaction", "DEFAULT_PREFIX", "DELETED")
+logger = logging.getLogger(__name__)
Undefined = NewType("Undefined", object)
-_undefined = Undefined(object())
-_deleted = Undefined(object())
-
-_DEFAULT_PREFIX = b""
+UNDEFINED = Undefined(object())
+DELETED = Undefined(object())
+DEFAULT_PREFIX = b""
-__all__ = ("RocksDBPartitionTransaction",)
-
-def _validate_transaction_state(func):
+def _validate_transaction_status(*allowed: PartitionTransactionStatus):
"""
- Check that the state of `RocksDBTransaction` is valid before calling a method
+ Check that the status of `RocksDBTransaction` is valid before calling a method
"""
- @functools.wraps(func)
- def wrapper(*args, **kwargs):
- self: RocksDBPartitionTransaction = args[0]
- if self.failed:
- raise StateTransactionError(
- "Transaction is failed, create a new one to proceed"
- )
- if self.completed:
- raise StateTransactionError(
- "Transaction is already finished, create a new one to proceed"
- )
+ def wrapper(func):
+ @functools.wraps(func)
+ def _wrapper(tx: "RocksDBPartitionTransaction", *args, **kwargs):
+ if tx.status not in allowed:
+ raise StateTransactionError(
+ f"Invalid transaction status {tx.status}, " f"allowed: {allowed}"
+ )
+
+ return func(tx, *args, **kwargs)
- return func(*args, **kwargs)
+ return _wrapper
return wrapper
@@ -79,10 +69,9 @@ class RocksDBPartitionTransaction(PartitionTransaction):
Prefixing
*********
- `RocksDBTransaction` allows to set prefixes for the keys in the given code block
- using :meth:`with_prefix()` context manager.
- Normally, `StreamingDataFrame` class will use message keys as prefixes
- in order to namespace the stored keys across different messages.
+ The `get()`, `set()`, `delete()` and `exists()` methods require prefixes for
+ the keys.
+ Normally, the Kafka message keys are supposed to be used as prefixes.
Transactional properties
************************
@@ -92,7 +81,7 @@ class RocksDBPartitionTransaction(PartitionTransaction):
within the transaction before it's flushed (aka "read-your-own-writes" problem).
If any mutation fails during the transaction
- (e.g. we failed to write the updates to the RocksDB), the whole transaction
+ (e.g., failed to write the updates to the RocksDB), the whole transaction
will be marked as failed and cannot be used anymore.
In this case, a new `RocksDBTransaction` should be created.
@@ -103,12 +92,9 @@ class RocksDBPartitionTransaction(PartitionTransaction):
"_partition",
"_update_cache",
"_batch",
- "_prefix",
- "_failed",
- "_completed",
"_dumps",
"_loads",
- "_state",
+ "_status",
)
def __init__(
@@ -116,6 +102,7 @@ def __init__(
partition: "RocksDBStorePartition",
dumps: DumpsFunc,
loads: LoadsFunc,
+ changelog_producer: Optional[ChangelogProducer] = None,
):
"""
:param partition: instance of `RocksDBStatePartition` to be used for accessing
@@ -124,51 +111,18 @@ def __init__(
:param loads: a function to deserialize data from bytes.
"""
self._partition = partition
- self._update_cache: Dict[str, Dict[bytes, Union[bytes, Undefined]]] = {}
+ self._update_cache: Dict[
+ str, Dict[bytes, Dict[bytes, Union[bytes, Undefined]]]
+ ] = {"default": {}}
self._batch = WriteBatch(raw_mode=True)
- self._prefix = _DEFAULT_PREFIX
- self._failed = False
- self._completed = False
self._dumps = dumps
self._loads = loads
- self._state = TransactionState(transaction=self)
-
- @property
- def state(self) -> TransactionState:
- return self._state
-
- @contextlib.contextmanager
- def with_prefix(self, prefix: Any = b"") -> Self:
- """
- A context manager set the prefix for all keys in the scope.
-
- Normally, it's called by Streaming DataFrames engine to ensure that every
- message key is stored separately.
-
- The `with_prefix` calls should not be nested.
- Only one prefix can be set at a time.
-
- :param prefix: a prefix string to be used.
- Should be either `bytes` or object serializable to `bytes`
- by `dumps` function.
- The prefix doesn't need to contain the separator, it will be added
- automatically between the key and the prefix if the prefix
- is not empty.
- """
- if self._prefix != _DEFAULT_PREFIX:
- raise NestedPrefixError("The transaction already has a prefix")
- self._prefix = (
- prefix if isinstance(prefix, bytes) else self._serialize_value(prefix)
- )
-
- try:
- yield self
- finally:
- self._prefix = _DEFAULT_PREFIX
+ self._status = PartitionTransactionStatus.STARTED
+ self._changelog_producer = changelog_producer
- @_validate_transaction_state
+ @_validate_transaction_status(PartitionTransactionStatus.STARTED)
def get(
- self, key: Any, default: Any = None, cf_name: str = "default"
+ self, key: Any, prefix: bytes, default: Any = None, cf_name: str = "default"
) -> Optional[Any]:
"""
Get a key from the store.
@@ -179,6 +133,7 @@ def get(
It returns `None` if the key is not found and `default` is not provided.
:param key: a key to get from DB
+ :param prefix: a key prefix
:param default: value to return if the key is not present in the state.
It can be of any type.
:param cf_name: rocksdb column family name. Default - "default"
@@ -187,90 +142,154 @@ def get(
# First, check the update cache in case the value was previously written
# Use _undefined sentinel as default because the actual value can be "None"
- key_serialized = self._serialize_key(key)
- cached = self._update_cache.get(cf_name, {}).get(key_serialized, _undefined)
- if cached is _deleted:
+ key_serialized = self._serialize_key(key, prefix=prefix)
+ cached = (
+ self._update_cache.get(cf_name, {})
+ .get(prefix, {})
+ .get(key_serialized, UNDEFINED)
+ )
+ if cached is DELETED:
return default
- if cached is not _undefined:
+ if cached is not UNDEFINED:
return self._deserialize_value(cached)
# The value is not found in cache, check the db
- stored = self._partition.get(key_serialized, _undefined, cf_name=cf_name)
- if stored is not _undefined:
+ stored = self._partition.get(key_serialized, UNDEFINED, cf_name=cf_name)
+ if stored is not UNDEFINED:
return self._deserialize_value(stored)
return default
- @_validate_transaction_state
- def set(self, key: Any, value: Any, cf_name: str = "default"):
+ @_validate_transaction_status(PartitionTransactionStatus.STARTED)
+ def set(self, key: Any, value: Any, prefix: bytes, cf_name: str = "default"):
"""
Set a key to the store.
It first updates the key in the update cache.
:param key: key to store in DB
+ :param prefix: a key prefix
:param value: value to store in DB
:param cf_name: rocksdb column family name. Default - "default"
"""
- key_serialized = self._serialize_key(key)
- value_serialized = self._serialize_value(value)
-
try:
- cf_handle = self._partition.get_column_family_handle(cf_name)
- self._batch.put(key_serialized, value_serialized, cf_handle)
- self._update_cache.setdefault(cf_name, {})[
+ key_serialized = self._serialize_key(key, prefix=prefix)
+ value_serialized = self._serialize_value(value)
+ self._update_cache.setdefault(cf_name, {}).setdefault(prefix, {})[
key_serialized
] = value_serialized
except Exception:
- self._failed = True
+ self._status = PartitionTransactionStatus.FAILED
raise
- @_validate_transaction_state
- def delete(self, key: Any, cf_name: str = "default"):
+ @_validate_transaction_status(PartitionTransactionStatus.STARTED)
+ def delete(self, key: Any, prefix: bytes, cf_name: str = "default"):
"""
Delete a key from the store.
It first deletes the key from the update cache.
- :param key: key to delete from DB
+ :param key: a key to delete from DB
+ :param prefix: a key prefix
:param cf_name: rocksdb column family name. Default - "default"
"""
- key_serialized = self._serialize_key(key)
try:
- cf_handle = self._partition.get_column_family_handle(cf_name)
- self._batch.delete(key_serialized, cf_handle)
-
- if cf_name not in self._update_cache:
- self._update_cache[cf_name] = {}
- self._update_cache[cf_name][key_serialized] = _deleted
+ key_serialized = self._serialize_key(key, prefix=prefix)
+ self._update_cache.setdefault(cf_name, {}).setdefault(prefix, {})[
+ key_serialized
+ ] = DELETED
except Exception:
- self._failed = True
+ self._status = PartitionTransactionStatus.FAILED
raise
- @_validate_transaction_state
- def exists(self, key: Any, cf_name: str = "default") -> bool:
+ @_validate_transaction_status(PartitionTransactionStatus.STARTED)
+ def exists(self, key: Any, prefix: bytes, cf_name: str = "default") -> bool:
"""
Check if a key exists in the store.
It first looks up the key in the update cache.
:param key: a key to check in DB
+ :param prefix: a key prefix
:param cf_name: rocksdb column family name. Default - "default"
:return: `True` if the key exists, `False` otherwise.
"""
- key_serialized = self._serialize_key(key)
- cached = self._update_cache.get(cf_name, {}).get(key_serialized, _undefined)
- if cached is _deleted:
+ key_serialized = self._serialize_key(key, prefix=prefix)
+ cached = (
+ self._update_cache.get(cf_name, {})
+ .get(prefix, {})
+ .get(key_serialized, UNDEFINED)
+ )
+ if cached is DELETED:
return False
- if cached is not _undefined:
+ if cached is not UNDEFINED:
return True
return self._partition.exists(key_serialized, cf_name=cf_name)
+ @_validate_transaction_status(PartitionTransactionStatus.STARTED)
+ def prepare(self, processed_offset: int):
+ """
+ Produce changelog messages to the changelog topic for all changes accumulated
+ in this transaction and prepare the transaction to flush its state to the state
+ store.
+
+ After successful `prepare()`, the transaction status is changed to PREPARED,
+ and it cannot receive updates anymore.
+
+ If changelog is disabled for this application, no updates will be produced
+ to the changelog topic.
+
+ :param processed_offset: the offset of the latest processed message
+ """
+ try:
+ self._produce_changelog(processed_offset=processed_offset)
+ self._status = PartitionTransactionStatus.PREPARED
+ except Exception:
+ self._status = PartitionTransactionStatus.FAILED
+ raise
+
+ @_validate_transaction_status(
+ PartitionTransactionStatus.STARTED, PartitionTransactionStatus.PREPARED
+ )
+ def flush(
+ self,
+ processed_offset: Optional[int] = None,
+ changelog_offset: Optional[int] = None,
+ ):
+ """
+ Flush the recent updates to the database.
+ It writes the WriteBatch to RocksDB and marks itself as finished.
+
+ If writing fails, the transaction is marked as failed and
+ cannot be used anymore.
+
+ >***NOTE:*** If no keys have been modified during the transaction
+ (i.e. no "set" or "delete" have been called at least once), it will
+ not flush ANY data to the database including the offset to optimize
+ I/O.
+
+ :param processed_offset: offset of the last processed message, optional.
+ :param changelog_offset: offset of the last produced changelog message,
+ optional.
+ """
+ try:
+ self._flush_state(
+ processed_offset=processed_offset, changelog_offset=changelog_offset
+ )
+ self._status = PartitionTransactionStatus.COMPLETE
+ except Exception:
+ self._status = PartitionTransactionStatus.FAILED
+ raise
+
+ @property
+ def status(self) -> PartitionTransactionStatus:
+ return self._status
+
@property
def completed(self) -> bool:
"""
@@ -283,7 +302,19 @@ def completed(self) -> bool:
:return: `True` if transaction is completed, `False` otherwise.
"""
- return self._completed
+ return self._status == PartitionTransactionStatus.COMPLETE
+
+ @property
+ def prepared(self) -> bool:
+ """
+ Check if the transaction is in PREPARED status.
+
+ A prepared transaction has successfully flushed its changelog and cannot receive
+ updates anymore, but its state is not yet flushed to the disk.
+
+ :return: `True` if transaction is prepared, `False` otherwise.
+ """
+ return self._status == PartitionTransactionStatus.PREPARED
@property
def failed(self) -> bool:
@@ -295,57 +326,125 @@ def failed(self) -> bool:
:return: `True` if transaction is failed, `False` otherwise.
"""
- return self._failed
-
- def _update_changelog(self, meta_cf_handle: ColumnFamily):
- logger.debug("Flushing state changes to the changelog topic...")
- offset = self._partition.get_changelog_offset() or 0
+ return self._status == PartitionTransactionStatus.FAILED
- for cf_name in self._update_cache:
- headers = {CHANGELOG_CF_MESSAGE_HEADER: cf_name}
- for k, v in self._update_cache[cf_name].items():
- self._partition.produce_to_changelog(
- key=k, value=v if v is not _deleted else None, headers=headers
- )
- offset += 1
+ @property
+ def changelog_topic_partition(self) -> Optional[Tuple[str, int]]:
+ """
+ Return the changelog topic-partition for the StorePartition of this transaction.
- self._batch.put(
- CHANGELOG_OFFSET_KEY, int_to_int64_bytes(offset), meta_cf_handle
- )
- logger.debug(f"Changelog offset set to {offset}")
+ Returns `None` if changelog_producer is not provided.
- @_validate_transaction_state
- def maybe_flush(self, offset: Optional[int] = None):
+ :return: (topic, partition) or None
"""
- Flush the recent updates to the database and empty the update cache.
- It writes the WriteBatch to RocksDB and marks itself as finished.
+ if self._changelog_producer is not None:
+ return (
+ self._changelog_producer.changelog_name,
+ self._changelog_producer.partition,
+ )
- If writing fails, the transaction will be also marked as "failed" and
- cannot be used anymore.
+ def as_state(self, prefix: Any = DEFAULT_PREFIX) -> TransactionState:
+ """
+ Create a one-time use `TransactionState` object with a limited CRUD interface
+ to be provided to `StreamingDataFrame` operations.
- >***NOTE:*** If no keys have been modified during the transaction
- (i.e. no "set" or "delete" have been called at least once), it will
- not flush ANY data to the database including the offset in order to optimize
- I/O.
+ The `TransactionState` will prefix all the keys with the supplied `prefix`
+ for all underlying operations.
- :param offset: offset of the last processed message, optional.
+ :param prefix: a prefix to be used for all keys
+ :return: an instance of `TransactionState`
"""
- try:
- # Don't write batches if this transaction doesn't change any keys
- if len(self._batch):
- cf_handle = self._partition.get_column_family_handle(METADATA_CF_NAME)
- if offset is not None:
- self._batch.put(
- PROCESSED_OFFSET_KEY, int_to_int64_bytes(offset), cf_handle
+ return TransactionState(
+ transaction=self,
+ prefix=(
+ prefix
+ if isinstance(prefix, bytes)
+ else serialize(prefix, dumps=self._dumps)
+ ),
+ )
+
+ def _produce_changelog(self, processed_offset: Optional[int] = None):
+ changelog_producer = self._changelog_producer
+ if changelog_producer is None:
+ return
+
+ changelog_topic, partition = (
+ changelog_producer.changelog_name,
+ changelog_producer.partition,
+ )
+ logger.debug(
+ f"Flushing state changes to the changelog topic "
+ f'topic_name="{changelog_topic}" '
+ f"partition={partition} "
+ f"processed_offset={processed_offset}"
+ )
+ # Iterate over the transaction update cache
+ for cf_name, cf_update_cache in self._update_cache.items():
+ source_tp_offset_header = json_dumps(processed_offset)
+ headers = {
+ CHANGELOG_CF_MESSAGE_HEADER: cf_name,
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: source_tp_offset_header,
+ }
+ for _prefix, prefix_update_cache in cf_update_cache.items():
+ for key, value in prefix_update_cache.items():
+ # Produce changes to the changelog topic
+ self._changelog_producer.produce(
+ key=key,
+ value=value if value is not DELETED else None,
+ headers=headers,
)
- if self._partition.using_changelogs:
- self._update_changelog(cf_handle)
- self._partition.write(self._batch)
- except Exception:
- self._failed = True
- raise
- finally:
- self._completed = True
+
+ def _flush_state(
+ self,
+ processed_offset: Optional[int] = None,
+ changelog_offset: Optional[int] = None,
+ ):
+ meta_cf_handle = self._partition.get_column_family_handle(METADATA_CF_NAME)
+ # Iterate over the transaction update cache
+ for cf_name, cf_update_cache in self._update_cache.items():
+ cf_handle = self._partition.get_column_family_handle(cf_name)
+ for _prefix, prefix_update_cache in cf_update_cache.items():
+ for key, value in prefix_update_cache.items():
+ # Apply changes to the Writebatch
+ if value is DELETED:
+ self._batch.delete(key, cf_handle)
+ else:
+ self._batch.put(key, value, cf_handle)
+
+ if not len(self._batch):
+ # Exit early if transaction doesn't update anything
+ return
+
+ # Save the latest processed input topic offset
+ if processed_offset is not None:
+ self._batch.put(
+ PROCESSED_OFFSET_KEY,
+ int_to_int64_bytes(processed_offset),
+ meta_cf_handle,
+ )
+ # Save the latest changelog topic offset to know where to recover from
+ # It may be None if changelog topics are disabled
+ if changelog_offset is not None:
+ current_changelog_offset = self._partition.get_changelog_offset()
+ if (
+ current_changelog_offset is not None
+ and changelog_offset < current_changelog_offset
+ ):
+ raise InvalidChangelogOffset(
+ f"Cannot set changelog offset lower than already saved one"
+ )
+ self._batch.put(
+ CHANGELOG_OFFSET_KEY,
+ int_to_int64_bytes(changelog_offset),
+ meta_cf_handle,
+ )
+ logger.debug(
+ f"Flushing state changes to the disk "
+ f'path="{self._partition.path}" '
+ f"processed_offset={processed_offset} "
+ f"changelog_offset={changelog_offset}"
+ )
+ self._partition.write(self._batch)
def _serialize_value(self, value: Any) -> bytes:
return serialize(value, dumps=self._dumps)
@@ -353,14 +452,22 @@ def _serialize_value(self, value: Any) -> bytes:
def _deserialize_value(self, value: bytes) -> Any:
return deserialize(value, loads=self._loads)
- def _serialize_key(self, key: Any) -> bytes:
+ def _serialize_key(self, key: Any, prefix: bytes) -> bytes:
key_bytes = serialize(key, dumps=self._dumps)
- prefix = self._prefix + PREFIX_SEPARATOR if self._prefix else b""
+ prefix = prefix + PREFIX_SEPARATOR if prefix else b""
return prefix + key_bytes
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
- if exc_val is None and not self._failed:
- self.maybe_flush()
+ """
+ Note: with state transactions, the context manager interface is meant
+ to be used mostly in unit tests.
+
+ Normally, the Checkpoint class is responsible for managing and flushing
+ the transactions.
+ """
+
+ if exc_val is None and not self.failed:
+ self.flush()
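
The transaction now moves through explicit statuses (STARTED → PREPARED → COMPLETE/FAILED) and caches updates per column family and per key prefix before producing the changelog and writing the batch. Below is a self-contained toy model of that flow; it mirrors the shape of the update cache and the status checks shown in this diff, but it is not the quixstreams implementation:

```python
import enum
from typing import Any, Dict


class Status(enum.Enum):
    STARTED = 1
    PREPARED = 2
    COMPLETE = 3
    FAILED = 4


class ToyTransaction:
    def __init__(self) -> None:
        self.status = Status.STARTED
        # {cf_name: {prefix: {key: value}}}, mirroring the new update cache shape
        self.cache: Dict[str, Dict[bytes, Dict[bytes, Any]]] = {"default": {}}

    def set(self, key: bytes, value: Any, prefix: bytes) -> None:
        assert self.status is Status.STARTED, "updates allowed only when STARTED"
        self.cache["default"].setdefault(prefix, {})[key] = value

    def prepare(self, processed_offset: int) -> None:
        assert self.status is Status.STARTED
        # here the real transaction produces changelog messages carrying
        # the processed offset as a message header
        self.status = Status.PREPARED

    def flush(self, processed_offset: int, changelog_offset: int) -> None:
        assert self.status in (Status.STARTED, Status.PREPARED)
        # here the real transaction writes the batch and both offsets to RocksDB
        self.status = Status.COMPLETE


tx = ToyTransaction()
tx.set(b"count", 1, prefix=b"message-key")
tx.prepare(processed_offset=10)
tx.flush(processed_offset=10, changelog_offset=3)
assert tx.status is Status.COMPLETE
```
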
diff --git a/quixstreams/state/rocksdb/windowed/partition.py b/quixstreams/state/rocksdb/windowed/partition.py
index ffa22826e..579292e88 100644
--- a/quixstreams/state/rocksdb/windowed/partition.py
+++ b/quixstreams/state/rocksdb/windowed/partition.py
@@ -57,6 +57,7 @@ def begin(self) -> "WindowedRocksDBPartitionTransaction":
dumps=self._dumps,
loads=self._loads,
latest_timestamp_ms=self._latest_timestamp_ms,
+ changelog_producer=self._changelog_producer,
)
def set_latest_timestamp(self, timestamp_ms: int):
diff --git a/quixstreams/state/rocksdb/windowed/state.py b/quixstreams/state/rocksdb/windowed/state.py
index 2b068d003..6a1071732 100644
--- a/quixstreams/state/rocksdb/windowed/state.py
+++ b/quixstreams/state/rocksdb/windowed/state.py
@@ -7,15 +7,18 @@
class WindowedTransactionState(WindowedState):
- __slots__ = ("_transaction",)
+ __slots__ = ("_transaction", "_prefix")
- def __init__(self, transaction: "WindowedRocksDBPartitionTransaction"):
+ def __init__(
+ self, transaction: "WindowedRocksDBPartitionTransaction", prefix: bytes
+ ):
"""
A windowed state to be provided into `StreamingDataFrame` window functions.
:param transaction: instance of `WindowedRocksDBPartitionTransaction`
"""
self._transaction = transaction
+ self._prefix = prefix
def get_window(
self, start_ms: int, end_ms: int, default: Any = None
@@ -30,7 +33,7 @@ def get_window(
:return: value or None if the key is not found and `default` is not provided
"""
return self._transaction.get_window(
- start_ms=start_ms, end_ms=end_ms, default=default
+ start_ms=start_ms, end_ms=end_ms, default=default, prefix=self._prefix
)
def update_window(self, start_ms: int, end_ms: int, value: Any, timestamp_ms: int):
@@ -47,7 +50,11 @@ def update_window(self, start_ms: int, end_ms: int, value: Any, timestamp_ms: in
:param timestamp_ms: current message timestamp in milliseconds
"""
return self._transaction.update_window(
- start_ms=start_ms, end_ms=end_ms, timestamp_ms=timestamp_ms, value=value
+ start_ms=start_ms,
+ end_ms=end_ms,
+ timestamp_ms=timestamp_ms,
+ value=value,
+ prefix=self._prefix,
)
def get_latest_timestamp(self) -> int:
@@ -74,5 +81,5 @@ def expire_windows(
"latest timestamp".
"""
return self._transaction.expire_windows(
- duration_ms=duration_ms, grace_ms=grace_ms
+ duration_ms=duration_ms, grace_ms=grace_ms, prefix=self._prefix
)
diff --git a/quixstreams/state/rocksdb/windowed/transaction.py b/quixstreams/state/rocksdb/windowed/transaction.py
index d54eb717f..445332a86 100644
--- a/quixstreams/state/rocksdb/windowed/transaction.py
+++ b/quixstreams/state/rocksdb/windowed/transaction.py
@@ -2,6 +2,7 @@
from rocksdict import ReadOptions
+from quixstreams.state.recovery import ChangelogProducer
from .metadata import LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, LATEST_EXPIRED_WINDOW_CF_NAME
from .serialization import encode_window_key, encode_window_prefix, parse_window_key
from .state import WindowedTransactionState
@@ -10,8 +11,8 @@
LATEST_TIMESTAMP_KEY,
PREFIX_SEPARATOR,
)
-from ..partition import RocksDBPartitionTransaction
from ..serialization import int_to_int64_bytes, serialize
+from ..transaction import RocksDBPartitionTransaction, DELETED, DEFAULT_PREFIX
from ..types import LoadsFunc, DumpsFunc
if TYPE_CHECKING:
@@ -27,15 +28,26 @@ def __init__(
dumps: DumpsFunc,
loads: LoadsFunc,
latest_timestamp_ms: int,
+ changelog_producer: Optional[ChangelogProducer] = None,
):
- super().__init__(partition=partition, dumps=dumps, loads=loads)
+ super().__init__(
+ partition=partition,
+ dumps=dumps,
+ loads=loads,
+ changelog_producer=changelog_producer,
+ )
self._partition = cast("WindowedRocksDBStorePartition", self._partition)
- self._state = WindowedTransactionState(transaction=self)
self._latest_timestamp_ms = latest_timestamp_ms
- @property
- def state(self) -> "WindowedTransactionState":
- return self._state
+ def as_state(self, prefix: Any = DEFAULT_PREFIX) -> WindowedTransactionState:
+ return WindowedTransactionState(
+ transaction=self,
+ prefix=(
+ prefix
+ if isinstance(prefix, bytes)
+ else serialize(prefix, dumps=self._dumps)
+ ),
+ )
def get_latest_timestamp(self) -> int:
return self._latest_timestamp_ms
@@ -47,37 +59,51 @@ def _validate_duration(self, start_ms: int, end_ms: int):
f"than window start {start_ms}"
)
- def get_window(self, start_ms: int, end_ms: int, default: Any = None) -> Any:
+ def get_window(
+ self,
+ start_ms: int,
+ end_ms: int,
+ prefix: bytes,
+ default: Any = None,
+ ) -> Any:
self._validate_duration(start_ms=start_ms, end_ms=end_ms)
key = encode_window_key(start_ms, end_ms)
- return self.get(key=key, default=default)
+ return self.get(key=key, default=default, prefix=prefix)
- def update_window(self, start_ms: int, end_ms: int, value: Any, timestamp_ms: int):
+ def update_window(
+ self, start_ms: int, end_ms: int, value: Any, timestamp_ms: int, prefix: bytes
+ ):
if timestamp_ms < 0:
raise ValueError("Timestamp cannot be negative")
self._validate_duration(start_ms=start_ms, end_ms=end_ms)
key = encode_window_key(start_ms, end_ms)
- self.set(key=key, value=value)
+ self.set(key=key, value=value, prefix=prefix)
self._latest_timestamp_ms = max(self._latest_timestamp_ms, timestamp_ms)
- def delete_window(self, start_ms: int, end_ms: int):
+ def delete_window(self, start_ms: int, end_ms: int, prefix: bytes):
self._validate_duration(start_ms=start_ms, end_ms=end_ms)
key = encode_window_key(start_ms, end_ms)
- self.delete(key=key)
+ self.delete(key=key, prefix=prefix)
- def maybe_flush(self, offset: Optional[int] = None):
+ def flush(
+ self,
+ processed_offset: Optional[int] = None,
+ changelog_offset: Optional[int] = None,
+ ):
cf_handle = self._partition.get_column_family_handle(METADATA_CF_NAME)
self._batch.put(
LATEST_TIMESTAMP_KEY,
int_to_int64_bytes(self._latest_timestamp_ms),
cf_handle,
)
- super().maybe_flush(offset=offset)
+ super().flush(
+ processed_offset=processed_offset, changelog_offset=changelog_offset
+ )
self._partition.set_latest_timestamp(self._latest_timestamp_ms)
def expire_windows(
- self, duration_ms: int, grace_ms: int = 0
+ self, duration_ms: int, prefix: bytes, grace_ms: int = 0
) -> List[Tuple[Tuple[int, int], Any]]:
"""
Get a list of expired windows from RocksDB considering latest timestamp,
@@ -104,7 +130,8 @@ def expire_windows(
# Find the latest start timestamp of the expired windows for the given key
last_expired = self.get(
- LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY,
+ key=LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY,
+ prefix=prefix,
cf_name=LATEST_EXPIRED_WINDOW_CF_NAME,
)
if last_expired is not None:
@@ -115,30 +142,30 @@ def expire_windows(
expired_windows = self._get_windows(
start_from_ms=start_from,
start_to_ms=start_to,
+ prefix=prefix,
)
if expired_windows:
# Save the start of the latest expired window to the expiration index
latest_window = expired_windows[-1]
last_expired__gt = latest_window[0][0]
self.set(
- LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY,
- last_expired__gt,
+ key=LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY,
+ value=last_expired__gt,
+ prefix=prefix,
cf_name=LATEST_EXPIRED_WINDOW_CF_NAME,
)
# Delete expired windows from the state
for (start, end), _ in expired_windows:
- self.delete_window(start, end)
+ self.delete_window(start, end, prefix=prefix)
return expired_windows
- def _serialize_key(self, key: Any) -> bytes:
+ def _serialize_key(self, key: Any, prefix: bytes) -> bytes:
# Allow bytes keys in WindowedStore
key_bytes = key if isinstance(key, bytes) else serialize(key, dumps=self._dumps)
- return self._prefix + PREFIX_SEPARATOR + key_bytes
+ return prefix + PREFIX_SEPARATOR + key_bytes
def _get_windows(
- self,
- start_from_ms: int,
- start_to_ms: int,
+ self, start_from_ms: int, start_to_ms: int, prefix: bytes
) -> List[Tuple[Tuple[int, int], Any]]:
"""
Get all windows starting between "start_from" and "start_to"
@@ -156,11 +183,11 @@ def _get_windows(
# Iterate over rocksdb within the given prefix and (start_form, start_to)
# timestamps
seek_from = max(start_from_ms, 0)
- seek_from_key = encode_window_prefix(prefix=self._prefix, start_ms=seek_from)
+ seek_from_key = encode_window_prefix(prefix=prefix, start_ms=seek_from)
# Add +1 to make the "start_to" inclusive
seek_to = start_to_ms + 1
- seek_to_key = encode_window_prefix(prefix=self._prefix, start_ms=seek_to)
+ seek_to_key = encode_window_prefix(prefix=prefix, start_ms=seek_to)
# Set iterator bounds to reduce the potential IO
read_opt = ReadOptions()
@@ -172,18 +199,17 @@ def _get_windows(
read_opt=read_opt, from_key=seek_from_key
):
message_key, start, end = parse_window_key(key)
+ if start_from_ms < start <= start_to_ms:
+ windows[(start, end)] = self._deserialize_value(value)
- if message_key != self._prefix or start > start_to_ms:
- break
- elif start <= start_from_ms:
- continue
-
- windows[(start, end)] = self._deserialize_value(value)
-
- for window_key, window_value in self._update_cache.get("default", {}).items():
+ for window_key, window_value in (
+ self._update_cache["default"].get(prefix, {}).items()
+ ):
message_key, start, end = parse_window_key(window_key)
- if message_key != self._prefix or not start_from_ms < start <= start_to_ms:
+ if window_value is DELETED:
+ windows.pop((start, end), None)
continue
- windows[(start, end)] = self._deserialize_value(window_value)
+ elif start_from_ms < start <= start_to_ms:
+ windows[(start, end)] = self._deserialize_value(window_value)
return sorted(windows.items())
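
Window lookups in `_get_windows()` now merge what is persisted in RocksDB with the transaction's in-memory update cache for the given prefix, dropping windows that the cache marks as deleted. A simplified, self-contained sketch of that merge (names are illustrative):

```python
from typing import Any, Dict, Tuple

DELETED = object()  # sentinel, analogous to the DELETED marker in the diff


def merge_windows(
    stored: Dict[Tuple[int, int], Any],
    cached: Dict[Tuple[int, int], Any],
    start_from_ms: int,
    start_to_ms: int,
):
    # Start with the persisted windows inside the requested time range
    windows = {
        (start, end): value
        for (start, end), value in stored.items()
        if start_from_ms < start <= start_to_ms
    }
    # Overlay the in-memory updates; deleted windows are removed from the result
    for (start, end), value in cached.items():
        if value is DELETED:
            windows.pop((start, end), None)
        elif start_from_ms < start <= start_to_ms:
            windows[(start, end)] = value
    return sorted(windows.items())


print(merge_windows(
    stored={(0, 10): "a", (10, 20): "b"},
    cached={(10, 20): DELETED, (20, 30): "c"},
    start_from_ms=-1,
    start_to_ms=25,
))  # [((0, 10), 'a'), ((20, 30), 'c')]
```
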
diff --git a/quixstreams/state/state.py b/quixstreams/state/state.py
index 7262cdcf1..3e06d9882 100644
--- a/quixstreams/state/state.py
+++ b/quixstreams/state/state.py
@@ -4,14 +4,18 @@
class TransactionState(State):
- __slots__ = ("_transaction",)
+ __slots__ = (
+ "_transaction",
+ "_prefix",
+ )
- def __init__(self, transaction: PartitionTransaction):
+ def __init__(self, prefix: bytes, transaction: PartitionTransaction):
"""
Simple key-value state to be provided into `StreamingDataFrame` functions
:param transaction: instance of `PartitionTransaction`
"""
+ self._prefix = prefix
self._transaction = transaction
def get(self, key: Any, default: Any = None) -> Optional[Any]:
@@ -22,7 +26,7 @@ def get(self, key: Any, default: Any = None) -> Optional[Any]:
:param default: default value to return if the key is not found
:return: value or None if the key is not found and `default` is not provided
"""
- return self._transaction.get(key=key, default=default)
+ return self._transaction.get(key=key, prefix=self._prefix, default=default)
def set(self, key: Any, value: Any):
"""
@@ -30,7 +34,7 @@ def set(self, key: Any, value: Any):
:param key: key
:param value: value
"""
- return self._transaction.set(key=key, value=value)
+ return self._transaction.set(key=key, value=value, prefix=self._prefix)
def delete(self, key: Any):
"""
@@ -39,7 +43,7 @@ def delete(self, key: Any):
This function always returns `None`, even if value is not found.
:param key: key
"""
- return self._transaction.delete(key=key)
+ return self._transaction.delete(key=key, prefix=self._prefix)
def exists(self, key: Any) -> bool:
"""
@@ -48,4 +52,4 @@ def exists(self, key: Any) -> bool:
:return: True if key exists, False otherwise
"""
- return self._transaction.exists(key=key)
+ return self._transaction.exists(key=key, prefix=self._prefix)
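
`TransactionState` simply forwards every call to the underlying transaction with a fixed prefix, and that prefix (normally the Kafka message key) is what namespaces keys in the store. A tiny sketch of the resulting key layout, assuming a `|` separator purely for illustration (the real separator is defined in `metadata.py`):

```python
PREFIX_SEPARATOR = b"|"  # assumed separator for this sketch


def serialize_key(key: bytes, prefix: bytes) -> bytes:
    # Mirrors the _serialize_key() logic shown earlier: prefix + separator + key
    return (prefix + PREFIX_SEPARATOR if prefix else b"") + key


# The same logical key stored for two different message keys ends up under
# two distinct RocksDB keys, so their values never collide.
assert serialize_key(b"count", prefix=b"sensor-1") != serialize_key(b"count", prefix=b"sensor-2")
```
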
diff --git a/quixstreams/state/types.py b/quixstreams/state/types.py
index 25f9b447c..37aa3b6db 100644
--- a/quixstreams/state/types.py
+++ b/quixstreams/state/types.py
@@ -1,8 +1,5 @@
-import contextlib
-
-from typing import Protocol, Any, Optional, Iterator, Callable, Dict, ClassVar
-
-from typing_extensions import Self
+import enum
+from typing import Protocol, Any, Optional, Callable, Dict, ClassVar, Tuple
from quixstreams.models import ConfluentKafkaMessageProto
from quixstreams.models.types import MessageHeadersMapping
@@ -60,9 +57,7 @@ def revoke_partition(self, partition: int):
"""
...
- def start_partition_transaction(
- self, partition: int
- ) -> Optional["PartitionTransaction"]:
+ def start_partition_transaction(self, partition: int) -> "PartitionTransaction":
"""
Start a new partition transaction.
@@ -103,23 +98,13 @@ def begin(self) -> "PartitionTransaction":
"""
def recover_from_changelog_message(
- self, changelog_message: ConfluentKafkaMessageProto
+ self, changelog_message: ConfluentKafkaMessageProto, committed_offset: int
):
"""
Updates state from a given changelog message.
:param changelog_message: A raw Confluent message read from a changelog topic.
- """
- ...
-
- def produce_to_changelog(
- self,
- key: bytes,
- value: Optional[bytes] = None,
- headers: Optional[MessageHeadersMapping] = None,
- ):
- """
- Produce a message to the StorePartitions respective changelog.
+ :param committed_offset: latest committed offset for the partition
"""
...
@@ -189,17 +174,59 @@ def exists(self, key: Any) -> bool:
...
-class PartitionTransaction(State):
+class PartitionTransaction(Protocol):
"""
A transaction class to perform simple key-value operations like
"get", "set", "delete" and "exists" on a single storage partition.
"""
- @property
- def state(self) -> State:
+ def as_state(self, prefix: Any) -> State:
+ """
+ Create an instance implementing the `State` protocol to be provided
+ to `StreamingDataFrame` functions.
+ All operations called on this State object will be prefixed with
+ the supplied `prefix`.
+
+ :return: an instance implementing the `State` protocol
+ """
+ ...
+
+ def get(self, key: Any, prefix: bytes, default: Any = None) -> Optional[Any]:
+ """
+ Get the value for key if key is present in the state, else default
+
+ :param key: key
+ :param prefix: a key prefix
+ :param default: default value to return if the key is not found
+ :return: value or None if the key is not found and `default` is not provided
"""
- An instance of State to be provided to `StreamingDataFrame` functions
- :return:
+ ...
+
+ def set(self, key: Any, prefix: bytes, value: Any):
+ """
+ Set value for the key.
+ :param key: key
+ :param prefix: a key prefix
+ :param value: value
+ """
+ ...
+
+ def delete(self, key: Any, prefix: bytes):
+ """
+ Delete value for the key.
+
+ This function always returns `None`, even if value is not found.
+ :param key: key
+ :param prefix: a key prefix
+ """
+ ...
+
+ def exists(self, key: Any, prefix: bytes) -> bool:
+ """
+ Check if the key exists in state.
+ :param key: key
+ :param prefix: a key prefix
+ :return: True if key exists, False otherwise
"""
...
@@ -216,29 +243,59 @@ def failed(self) -> bool:
@property
def completed(self) -> bool:
"""
- Return `True` if transaction is completed.
+ Return `True` if transaction is successfully completed.
Completed transactions cannot be re-used.
:return: bool
"""
...
- @contextlib.contextmanager
- def with_prefix(self, prefix: Any = b"") -> Iterator[Self]:
+ @property
+ def prepared(self) -> bool:
"""
- A context manager set the prefix for all keys in the scope.
+ Return `True` if transaction is prepared.
- Normally, it's called by `StreamingDataFrame` internals to ensure that every
- message key is stored separately.
- :param prefix: key prefix
- :return: context manager
+ Prepared transactions cannot receive new updates, but can be flushed.
+ :return: bool
"""
...
- def maybe_flush(self, offset: Optional[int] = None):
+ def prepare(self, processed_offset: int):
"""
- Flush the recent updates and last processed offset to the storage.
- :param offset: offset of the last processed message, optional.
+ Produce changelog messages to the changelog topic for all changes accumulated
+ in this transaction and prepare the transaction to flush its state to the state
+ store.
+
+ After successful `prepare()`, the transaction status is changed to PREPARED,
+ and it cannot receive updates anymore.
+
+ If changelog is disabled for this application, no updates will be produced
+ to the changelog topic.
+
+ :param processed_offset: the offset of the latest processed message
+ """
+
+ @property
+ def changelog_topic_partition(self) -> Optional[Tuple[str, int]]:
+ """
+ Return the changelog topic-partition for the StorePartition of this transaction.
+
+ Returns `None` if changelog_producer is not provided.
+
+ :return: (topic, partition) or None
+ """
+
+ def flush(
+ self,
+ processed_offset: Optional[int] = None,
+ changelog_offset: Optional[int] = None,
+ ):
+ """
+ Flush the recent updates to the storage.
+
+ :param processed_offset: offset of the last processed message, optional.
+ :param changelog_offset: offset of the last produced changelog message,
+ optional.
"""
def __enter__(self): ...
@@ -305,9 +362,7 @@ def expire_windows(self, duration_ms: int, grace_ms: int = 0):
...
-class WindowedPartitionTransaction(WindowedState):
- @property
- def state(self) -> WindowedState: ...
+class WindowedPartitionTransaction(Protocol):
@property
def failed(self) -> bool:
@@ -322,28 +377,123 @@ def failed(self) -> bool:
@property
def completed(self) -> bool:
"""
- Return `True` if transaction is completed.
+ Return `True` if transaction is successfully completed.
Completed transactions cannot be re-used.
:return: bool
"""
...
- def with_prefix(self, prefix: Any = b"") -> Iterator[Self]:
+ @property
+ def prepared(self) -> bool:
+ """
+ Return `True` if transaction is prepared.
+
+ Prepared transactions cannot receive new updates, but can be flushed.
+ :return: bool
+ """
+ ...
+
+ def prepare(self, processed_offset: int):
+ """
+ Produce changelog messages to the changelog topic for all changes accumulated
+ in this transaction and prepare the transaction to flush its state to the state
+ store.
+
+ After successful `prepare()`, the transaction status is changed to PREPARED,
+ and it cannot receive updates anymore.
+
+ If changelog is disabled for this application, no updates will be produced
+ to the changelog topic.
+
+ :param processed_offset: the offset of the latest processed message
+ """
+
+ def as_state(self, prefix: Any) -> WindowedState: ...
+
+ def get_window(
+ self,
+ start_ms: int,
+ end_ms: int,
+ prefix: bytes,
+ default: Any = None,
+ ) -> Optional[Any]:
+ """
+ Get the value of the window defined by `start` and `end` timestamps
+ if the window is present in the state, else default
+
+ :param start_ms: start of the window in milliseconds
+ :param end_ms: end of the window in milliseconds
+ :param prefix: a key prefix
+ :param default: default value to return if the key is not found
+ :return: value or None if the key is not found and `default` is not provided
+ """
+ ...
+
+ def update_window(
+ self, start_ms: int, end_ms: int, value: Any, timestamp_ms: int, prefix: bytes
+ ):
"""
- A context manager set the prefix for all keys in the scope.
+ Set a value for the window.
- Normally, it's called by `StreamingDataFrame` internals to ensure that every
- message key is stored separately.
- :param prefix: key prefix
- :return: context manager
+ This method will also update the latest observed timestamp in the state partition
+ using the provided `timestamp_ms`.
+
+ :param start_ms: start of the window in milliseconds
+ :param end_ms: end of the window in milliseconds
+ :param value: value of the window
+ :param timestamp_ms: current message timestamp in milliseconds
+ :param prefix: a key prefix
"""
...
- def maybe_flush(self, offset: Optional[int] = None):
+ def get_latest_timestamp(self) -> int:
"""
- Flush the recent updates and last processed offset to the storage.
- :param offset: offset of the last processed message, optional.
+ Get the latest observed timestamp for the current state partition.
+
+ Use this timestamp to determine if the arriving event is late and should be
+ discarded from the processing.
+
+ :return: latest observed event timestamp in milliseconds
+ """
+ ...
+
+ def expire_windows(self, duration_ms: int, prefix: bytes, grace_ms: int = 0):
+ """
+ Get a list of expired windows from RocksDB considering the current
+ latest timestamp, window duration and grace period.
+
+ It also marks the latest found window as expired in the expiration index, so
+ calling this method multiple times will yield different results for the same
+ "latest timestamp".
+
+ :param duration_ms: duration of the windows in milliseconds
+ :param prefix: a key prefix
+ :param grace_ms: grace period in milliseconds. Default - "0"
+ """
+ ...
+
+ def flush(
+ self,
+ processed_offset: Optional[int] = None,
+ changelog_offset: Optional[int] = None,
+ ):
+ """
+ Flush the recent updates to the storage.
+
+ :param processed_offset: offset of the last processed message, optional.
+ :param changelog_offset: offset of the last produced changelog message,
+ optional.
+ """
+
+ @property
+ def changelog_topic_partition(self) -> Optional[Tuple[str, int]]:
+ """
+ Return the changelog topic-partition for the StorePartition of this transaction.
+
+ Returns `None` if changelog_producer is not provided.
+
+ :return: (topic, partition) or None
"""
def __enter__(self): ...
@@ -360,6 +510,17 @@ def write_from_changelog_message(self): ...
def flush(self):
"""
- Flush the recovery update and last processed offset to the storage.
+ Flush the recovery update to the storage.
"""
...
+
+
+class PartitionTransactionStatus(enum.Enum):
+ STARTED = 1 # Transaction is started and accepts updates
+
+ PREPARED = 2 # Transaction is prepared, it can no longer receive updates
+ # and can only be flushed
+
+ COMPLETE = 3 # Transaction is fully completed, it cannot be used anymore
+
+ FAILED = 4 # Transaction is failed, it cannot be used anymore
diff --git a/tests/test_quixstreams/fixtures.py b/tests/test_quixstreams/fixtures.py
index 636035115..e3698a241 100644
--- a/tests/test_quixstreams/fixtures.py
+++ b/tests/test_quixstreams/fixtures.py
@@ -240,6 +240,11 @@ def factory(
return factory
+@pytest.fixture()
+def row_producer(row_producer_factory):
+ return row_producer_factory()
+
+
@pytest.fixture()
def row_factory():
"""
@@ -276,6 +281,7 @@ def app_factory(kafka_container, random_consumer_group, tmp_path):
def factory(
consumer_group: Optional[str] = None,
auto_offset_reset: AutoOffsetReset = "latest",
+ commit_interval: float = 5.0,
consumer_extra_config: Optional[dict] = None,
producer_extra_config: Optional[dict] = None,
on_consumer_error: Optional[ConsumerErrorCallback] = None,
@@ -292,6 +298,7 @@ def factory(
broker_address=kafka_container.broker_address,
consumer_group=consumer_group or random_consumer_group,
auto_offset_reset=auto_offset_reset,
+ commit_interval=commit_interval,
consumer_extra_config=consumer_extra_config,
producer_extra_config=producer_extra_config,
on_consumer_error=on_consumer_error,
@@ -335,21 +342,6 @@ def state_manager(state_manager_factory) -> StateStoreManager:
manager.close()
-@pytest.fixture()
-def state_manager_changelogs(
- state_manager_factory,
- topic_admin,
- recovery_manager_mock_consumer,
-) -> StateStoreManager:
- manager = state_manager_factory(
- producer=create_autospec(RowProducer)("broker"),
- recovery_manager=recovery_manager_mock_consumer,
- )
- manager.init()
- yield manager
- manager.close()
-
-
@pytest.fixture()
def quix_mock_config_builder_factory(kafka_container):
def factory(workspace_id: Optional[str] = None):
diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py
index 1d7c9e2e4..ee9ec4e2f 100644
--- a/tests/test_quixstreams/test_app.py
+++ b/tests/test_quixstreams/test_app.py
@@ -1,3 +1,4 @@
+import contextlib
import logging
import os
import time
@@ -12,23 +13,20 @@
from quixstreams.app import Application
from quixstreams.dataframe import StreamingDataFrame
from quixstreams.dataframe.windows.base import get_window_ranges
+from quixstreams.exceptions import PartitionAssignmentError
+from quixstreams.kafka.exceptions import KafkaConsumerException
from quixstreams.models import (
DoubleDeserializer,
DoubleSerializer,
JSONDeserializer,
SerializationError,
JSONSerializer,
+ TopicConfig,
)
-from quixstreams.platforms.quix import (
- QuixKafkaConfigsBuilder,
-)
+from quixstreams.platforms.quix import QuixKafkaConfigsBuilder
from quixstreams.platforms.quix.env import QuixEnvironment
-from quixstreams.rowconsumer import (
- KafkaMessageError,
- RowConsumer,
-)
+from quixstreams.rowconsumer import RowConsumer
from quixstreams.state import State
-from tests.utils import TopicPartitionStub
def _stop_app_on_future(app: Application, future: Future, timeout: float):
@@ -94,7 +92,7 @@ def test_produce_and_consume(self, app_factory, topic_factory):
for msg in consumed_messages:
assert msg in messages_to_produce
- def test_run_consume_and_produce(
+ def test_run_success(
self,
app_factory,
row_consumer_factory,
@@ -173,6 +171,61 @@ def on_message_processed(topic_, partition, offset):
assert row.key == data["key"]
assert row.value == {column_name: loads(data["value"].decode())}
+ def test_run_fails_no_commit(
+ self,
+ app_factory,
+ row_consumer_factory,
+ executor,
+ row_factory,
+ ):
+ """
+ Test that Application doesn't commit the checkpoint in case of failure
+ """
+
+ app = app_factory(
+ auto_offset_reset="earliest",
+ commit_interval=9999, # Set a high commit interval to ensure no autocommit
+ )
+
+ partition_num = 0
+ topic_in = app.topic(str(uuid.uuid4()))
+
+ def count_and_fail(_):
+ # Count the incoming messages and fail on processing the last one
+ nonlocal processed_count
+
+ processed_count += 1
+ # Stop processing after consuming all the messages
+ if processed_count == total_messages:
+ raise ValueError("test")
+
+ sdf = app.dataframe(topic_in).apply(count_and_fail)
+
+ processed_count = 0
+ total_messages = 3
+ # Produce messages to the topic and flush
+ data = {"key": b"key", "value": b'"value"', "partition": partition_num}
+ with app.get_producer() as producer:
+ for _ in range(total_messages):
+ producer.produce(topic_in.name, **data)
+
+ failed = Future()
+
+ # Stop app when the future is resolved
+ executor.submit(_stop_app_on_future, app, failed, 10.0)
+ with pytest.raises(ValueError):
+ app.run(sdf)
+
+ # Check that all messages have been processed
+ assert processed_count == total_messages
+
+ # Ensure the offset is not committed to Kafka
+ with row_consumer_factory() as row_consumer:
+ committed, *_ = row_consumer.committed(
+ [TopicPartition(topic_in.name, partition_num)]
+ )
+ assert committed.offset == -1001
+
def test_run_consumer_error_raised(self, app_factory, executor):
# Set "auto_offset_reset" to "error" to simulate errors in Consumer
app = app_factory(auto_offset_reset="error")
@@ -183,7 +236,7 @@ def test_run_consumer_error_raised(self, app_factory, executor):
# Stop app after 10s if nothing failed
executor.submit(_stop_app_on_timeout, app, 10.0)
- with pytest.raises(KafkaMessageError):
+ with pytest.raises(KafkaConsumerException):
app.run(sdf)
def test_run_deserialization_error_raised(self, app_factory, executor):
@@ -438,11 +491,11 @@ def test_producer_extra_config(self, app_factory):
Test that producer receives the Application extra configs
"""
app = app_factory(
- producer_extra_config={"max.in.flight": "123"},
+ producer_extra_config={"linger.ms": 10},
)
with app.get_producer() as x:
- assert x._producer_config["max.in.flight"] is "123"
+ assert x._producer_config["linger.ms"] == 10
def test_missing_broker_id_raise(self):
# confirm environment is empty
@@ -474,120 +527,174 @@ def test_consumer_group_default(self):
class TestQuixApplication:
def test_init_with_quix_sdk_token_arg(self):
- def cfg():
- return {
- "sasl.mechanisms": "SCRAM-SHA-256",
- "security.protocol": "SASL_SSL",
- "bootstrap.servers": "address1,address2",
- "sasl.username": "my-username",
- "sasl.password": "my-password",
- "ssl.ca.location": "/mock/dir/ca.cert",
- }
-
consumer_group = "c_group"
expected_workspace_cgroup = f"my_ws-{consumer_group}"
quix_sdk_token = "my_sdk_token"
+ broker_address = "address1,address2"
+
+ extra_config = {"extra": "config"}
+ auth_params = {
+ "sasl.mechanisms": "SCRAM-SHA-256",
+ "security.protocol": "SASL_SSL",
+ "sasl.username": "my-username",
+ "sasl.password": "my-password",
+ "ssl.ca.location": "/mock/dir/ca.cert",
+ }
+ confluent_broker_config = {
+ **auth_params,
+ "bootstrap.servers": broker_address,
+ }
+ expected_producer_extra_config = {
+ "enable.idempotence": True,
+ **auth_params,
+ **extra_config,
+ }
+ expected_consumer_extra_config = {**auth_params, **extra_config}
def get_cfg_builder(quix_sdk_token):
cfg_builder = create_autospec(QuixKafkaConfigsBuilder)
- cfg_builder.get_confluent_broker_config.side_effect = cfg
+ cfg_builder.get_confluent_broker_config.return_value = (
+ confluent_broker_config
+ )
cfg_builder.prepend_workspace_id.return_value = expected_workspace_cgroup
cfg_builder.quix_sdk_token = quix_sdk_token
return cfg_builder
- with patch("quixstreams.app.QuixKafkaConfigsBuilder", get_cfg_builder):
- app = Application(
+ # Mock consumer and producer to check the init args
+ with patch("quixstreams.app.QuixKafkaConfigsBuilder", get_cfg_builder), patch(
+ "quixstreams.app.RowConsumer"
+ ) as consumer_init_mock, patch(
+ "quixstreams.app.RowProducer"
+ ) as producer_init_mock:
+ Application(
consumer_group=consumer_group,
quix_sdk_token=quix_sdk_token,
- consumer_extra_config={"extra": "config"},
- producer_extra_config={"extra": "config"},
+ consumer_extra_config=extra_config,
+ producer_extra_config=extra_config,
)
# Check if items from the Quix config have been passed
# to the low-level configs of producer and consumer
- assert cfg().items() <= app._producer._producer_config.items()
- assert cfg().items() <= app._consumer._consumer_config.items()
-
- assert app._producer._producer_config["extra"] == "config"
- assert app._consumer._consumer_config["extra"] == "config"
- assert app._consumer._consumer_config["group.id"] == expected_workspace_cgroup
-
- def test_init_with_quix_sdk_token_env(self):
- def cfg():
- return {
- "sasl.mechanisms": "SCRAM-SHA-256",
- "security.protocol": "SASL_SSL",
- "bootstrap.servers": "address1,address2",
- "sasl.username": "my-username",
- "sasl.password": "my-password",
- "ssl.ca.location": "/mock/dir/ca.cert",
- }
+ producer_call_kwargs = producer_init_mock.call_args.kwargs
+ assert producer_call_kwargs["broker_address"] == broker_address
+ assert producer_call_kwargs["extra_config"] == expected_producer_extra_config
+ consumer_call_kwargs = consumer_init_mock.call_args.kwargs
+ assert consumer_call_kwargs["broker_address"] == broker_address
+ assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup
+ assert consumer_call_kwargs["extra_config"] == expected_consumer_extra_config
+
+ def test_init_with_quix_sdk_token_env(self, monkeypatch):
consumer_group = "c_group"
expected_workspace_cgroup = f"my_ws-{consumer_group}"
quix_sdk_token = "my_sdk_token"
+ broker_address = "address1,address2"
+
+ extra_config = {"extra": "config"}
+ auth_params = {
+ "sasl.mechanisms": "SCRAM-SHA-256",
+ "security.protocol": "SASL_SSL",
+ "sasl.username": "my-username",
+ "sasl.password": "my-password",
+ "ssl.ca.location": "/mock/dir/ca.cert",
+ }
+ confluent_broker_config = {
+ **auth_params,
+ "bootstrap.servers": broker_address,
+ }
+ expected_producer_extra_config = {
+ "enable.idempotence": True,
+ **auth_params,
+ **extra_config,
+ }
+ expected_consumer_extra_config = {**auth_params, **extra_config}
def get_cfg_builder(quix_sdk_token):
cfg_builder = create_autospec(QuixKafkaConfigsBuilder)
- cfg_builder.get_confluent_broker_config.side_effect = cfg
+ cfg_builder.get_confluent_broker_config.return_value = (
+ confluent_broker_config
+ )
cfg_builder.prepend_workspace_id.return_value = expected_workspace_cgroup
cfg_builder.quix_sdk_token = quix_sdk_token
return cfg_builder
- with patch.dict(os.environ, {"Quix__Sdk__Token": quix_sdk_token}):
- with patch("quixstreams.app.QuixKafkaConfigsBuilder", get_cfg_builder):
- app = Application(
- consumer_group=consumer_group,
- consumer_extra_config={"extra": "config"},
- producer_extra_config={"extra": "config"},
- )
+ monkeypatch.setenv("Quix__Sdk__Token", quix_sdk_token)
+ with patch("quixstreams.app.QuixKafkaConfigsBuilder", get_cfg_builder), patch(
+ "quixstreams.app.RowConsumer"
+ ) as consumer_init_mock, patch(
+ "quixstreams.app.RowProducer"
+ ) as producer_init_mock:
+ Application(
+ consumer_group=consumer_group,
+ consumer_extra_config=extra_config,
+ producer_extra_config=extra_config,
+ )
# Check if items from the Quix config have been passed
# to the low-level configs of producer and consumer
- assert cfg().items() <= app._producer._producer_config.items()
- assert cfg().items() <= app._consumer._consumer_config.items()
+ producer_call_kwargs = producer_init_mock.call_args.kwargs
+ assert producer_call_kwargs["broker_address"] == broker_address
+ assert producer_call_kwargs["extra_config"] == expected_producer_extra_config
- assert app._producer._producer_config["extra"] == "config"
- assert app._consumer._consumer_config["extra"] == "config"
- assert app._consumer._consumer_config["group.id"] == expected_workspace_cgroup
+ consumer_call_kwargs = consumer_init_mock.call_args.kwargs
+ assert consumer_call_kwargs["broker_address"] == broker_address
+ assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup
+ assert consumer_call_kwargs["extra_config"] == expected_consumer_extra_config
def test_init_with_quix_config_builder(self):
- def cfg():
- return {
- "sasl.mechanisms": "SCRAM-SHA-256",
- "security.protocol": "SASL_SSL",
- "bootstrap.servers": "address1,address2",
- "sasl.username": "my-username",
- "sasl.password": "my-password",
- "ssl.ca.location": "/mock/dir/ca.cert",
- }
-
consumer_group = "c_group"
expected_workspace_cgroup = f"my_ws-{consumer_group}"
quix_sdk_token = "my_sdk_token"
+ broker_address = "address1,address2"
+
+ extra_config = {"extra": "config"}
+ auth_params = {
+ "sasl.mechanisms": "SCRAM-SHA-256",
+ "security.protocol": "SASL_SSL",
+ "sasl.username": "my-username",
+ "sasl.password": "my-password",
+ "ssl.ca.location": "/mock/dir/ca.cert",
+ }
+ confluent_broker_config = {
+ **auth_params,
+ "bootstrap.servers": broker_address,
+ }
+ expected_producer_extra_config = {
+ "enable.idempotence": True,
+ **auth_params,
+ **extra_config,
+ }
+ expected_consumer_extra_config = {**auth_params, **extra_config}
def get_cfg_builder(quix_sdk_token):
cfg_builder = create_autospec(QuixKafkaConfigsBuilder)
- cfg_builder.get_confluent_broker_config.side_effect = cfg
+ cfg_builder.get_confluent_broker_config.return_value = (
+ confluent_broker_config
+ )
cfg_builder.prepend_workspace_id.return_value = expected_workspace_cgroup
cfg_builder.quix_sdk_token = quix_sdk_token
return cfg_builder
- app = Application(
- consumer_group=consumer_group,
- quix_config_builder=get_cfg_builder(quix_sdk_token),
- consumer_extra_config={"extra": "config"},
- producer_extra_config={"extra": "config"},
- )
+ with patch("quixstreams.app.RowConsumer") as consumer_init_mock, patch(
+ "quixstreams.app.RowProducer"
+ ) as producer_init_mock:
+ Application(
+ consumer_group=consumer_group,
+ quix_config_builder=get_cfg_builder(quix_sdk_token),
+ consumer_extra_config={"extra": "config"},
+ producer_extra_config={"extra": "config"},
+ )
# Check if items from the Quix config have been passed
# to the low-level configs of producer and consumer
- assert cfg().items() <= app._producer._producer_config.items()
- assert cfg().items() <= app._consumer._consumer_config.items()
+ producer_call_kwargs = producer_init_mock.call_args.kwargs
+ assert producer_call_kwargs["broker_address"] == broker_address
+ assert producer_call_kwargs["extra_config"] == expected_producer_extra_config
- assert app._producer._producer_config["extra"] == "config"
- assert app._consumer._consumer_config["extra"] == "config"
- assert app._consumer._consumer_config["group.id"] == expected_workspace_cgroup
+ consumer_call_kwargs = consumer_init_mock.call_args.kwargs
+ assert consumer_call_kwargs["broker_address"] == broker_address
+ assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup
+ assert consumer_call_kwargs["extra_config"] == expected_consumer_extra_config
def test_init_with_broker_id_raises(self):
with pytest.raises(ValueError) as e_info:
@@ -680,35 +787,54 @@ class TestDeprecatedApplicationDotQuix:
"""
def test_init(self):
- def cfg():
- return {
- "sasl.mechanisms": "SCRAM-SHA-256",
- "security.protocol": "SASL_SSL",
- "bootstrap.servers": "address1,address2",
- "sasl.username": "my-username",
- "sasl.password": "my-password",
- "ssl.ca.location": "/mock/dir/ca.cert",
- }
+ consumer_group = "c_group"
+ expected_workspace_cgroup = f"my_ws-{consumer_group}"
+ broker_address = "address1,address2"
+
+ extra_config = {"extra": "config"}
+ auth_params = {
+ "sasl.mechanisms": "SCRAM-SHA-256",
+ "security.protocol": "SASL_SSL",
+ "sasl.username": "my-username",
+ "sasl.password": "my-password",
+ "ssl.ca.location": "/mock/dir/ca.cert",
+ }
+ confluent_broker_config = {
+ **auth_params,
+ "bootstrap.servers": broker_address,
+ }
+ expected_producer_extra_config = {
+ "enable.idempotence": True,
+ **auth_params,
+ **extra_config,
+ }
+ expected_consumer_extra_config = {**auth_params, **extra_config}
cfg_builder = create_autospec(QuixKafkaConfigsBuilder)
- cfg_builder.get_confluent_broker_config.side_effect = cfg
+ cfg_builder.get_confluent_broker_config.return_value = confluent_broker_config
cfg_builder.prepend_workspace_id.return_value = "my_ws-c_group"
cfg_builder.strip_workspace_id_prefix.return_value = "c_group"
- app = Application.Quix(
- quix_config_builder=cfg_builder,
- consumer_group="c_group",
- consumer_extra_config={"extra": "config"},
- producer_extra_config={"extra": "config"},
- )
+ with patch("quixstreams.app.RowConsumer") as consumer_init_mock, patch(
+ "quixstreams.app.RowProducer"
+ ) as producer_init_mock:
+ Application.Quix(
+ quix_config_builder=cfg_builder,
+ consumer_group="c_group",
+ consumer_extra_config={"extra": "config"},
+ producer_extra_config={"extra": "config"},
+ )
# Check if items from the Quix config have been passed
# to the low-level configs of producer and consumer
- assert cfg().items() <= app._producer._producer_config.items()
- assert cfg().items() <= app._consumer._consumer_config.items()
+ producer_call_kwargs = producer_init_mock.call_args.kwargs
+ assert producer_call_kwargs["broker_address"] == broker_address
+ assert producer_call_kwargs["extra_config"] == expected_producer_extra_config
+
+ consumer_call_kwargs = consumer_init_mock.call_args.kwargs
+ assert consumer_call_kwargs["broker_address"] == broker_address
+ assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup
+ assert consumer_call_kwargs["extra_config"] == expected_consumer_extra_config
- assert app._producer._producer_config["extra"] == "config"
- assert app._consumer._consumer_config["extra"] == "config"
- assert app._consumer._consumer_config["group.id"] == "my_ws-c_group"
cfg_builder.prepend_workspace_id.assert_called_with("c_group")
def test_topic_name_and_config(self, app_dot_quix_factory):
@@ -843,15 +969,14 @@ def count(_, state: State):
)
state_manager.register_store(topic_in.name, "default")
state_manager.on_partition_assign(
- TopicPartitionStub(topic=topic_in.name, partition=partition_num)
+ topic=topic_in.name, partition=partition_num, committed_offset=-1001
)
store = state_manager.get_store(topic=topic_in.name, store_name="default")
with store.start_partition_transaction(partition=partition_num) as tx:
# All keys in state must be prefixed with the message key
- with tx.with_prefix(message_key):
- assert tx.get("total") == total_consumed.result()
+ assert tx.get("total", prefix=message_key) == total_consumed.result()
- def test_run_stateful_processing_fails(
+ def test_run_stateful_fails_no_commit(
self,
app_factory,
executor,
@@ -860,39 +985,36 @@ def test_run_stateful_processing_fails(
):
consumer_group = str(uuid.uuid4())
state_dir = (tmp_path / "state").absolute()
- partition_num = 0
app = app_factory(
consumer_group=consumer_group,
auto_offset_reset="earliest",
state_dir=state_dir,
+ commit_interval=9999, # Set a high commit interval to ensure no autocommit
)
topic_in = app.topic(str(uuid.uuid4()), value_deserializer=JSONDeserializer())
# Define a function that counts incoming Rows using state
- def count(_, state: State):
+ def count_and_fail(_, state: State):
total = state.get("total", 0)
total += 1
state.set("total", total)
+ # Fail after processing all messages
+ if total == total_messages:
+ raise ValueError("test")
failed = Future()
- def fail(*_):
- failed.set_result(True)
- raise ValueError("test")
-
- sdf = app.dataframe(topic_in).update(count, stateful=True).update(fail)
+ sdf = app.dataframe(topic_in).update(count_and_fail, stateful=True)
total_messages = 3
# Produce messages to the topic and flush
- data = {
- "key": b"key",
- "value": dumps({"key": "value"}),
- "partition": partition_num,
- }
+ key = b"key"
+ value = dumps({"key": "value"})
+
with app.get_producer() as producer:
for _ in range(total_messages):
- producer.produce(topic_in.name, **data)
+ producer.produce(topic_in.name, key=key, value=value)
# Stop app when the future is resolved
executor.submit(_stop_app_on_future, app, failed, 10.0)
@@ -905,11 +1027,11 @@ def fail(*_):
)
state_manager.register_store(topic_in.name, "default")
state_manager.on_partition_assign(
- TopicPartitionStub(topic=topic_in.name, partition=partition_num)
+ topic=topic_in.name, partition=0, committed_offset=-1001
)
store = state_manager.get_store(topic=topic_in.name, store_name="default")
- with store.start_partition_transaction(partition=partition_num) as tx:
- assert tx.get("total") is None
+ with store.start_partition_transaction(partition=0) as tx:
+ assert tx.get("total", prefix=key) is None
def test_run_stateful_suppress_processing_errors(
self,
@@ -969,12 +1091,11 @@ def fail(_):
)
state_manager.register_store(topic_in.name, "default")
state_manager.on_partition_assign(
- TopicPartitionStub(topic=topic_in.name, partition=partition_num)
+ topic=topic_in.name, partition=partition_num, committed_offset=-1001
)
store = state_manager.get_store(topic=topic_in.name, store_name="default")
with store.start_partition_transaction(partition=partition_num) as tx:
- with tx.with_prefix(message_key):
- assert tx.get("total") == total_consumed.result()
+ assert tx.get("total", prefix=message_key) == total_consumed.result()
def test_on_assign_topic_offset_behind_warning(
self,
@@ -1001,13 +1122,13 @@ def test_on_assign_topic_offset_behind_warning(
with state_manager:
state_manager.register_store(topic_in.name, "default")
state_partitions = state_manager.on_partition_assign(
- TopicPartitionStub(topic=topic_in.name, partition=partition_num)
+ topic=topic_in.name, partition=partition_num, committed_offset=-1001
)
- with state_manager.start_store_transaction(
- topic=topic_in.name, partition=partition_num, offset=9999
- ):
- tx = state_manager.get_store_transaction()
- tx.set("key", "value")
+ store = state_manager.get_store(topic_in.name, "default")
+ tx = store.start_partition_transaction(partition_num)
+ # Make a change to probe the WriteBatch
+ tx.set("key", "value", prefix=b"__key__")
+ tx.flush(processed_offset=9999)
assert state_partitions[partition_num].get_processed_offset() == 9999
# Define some stateful function so the App assigns store partitions
@@ -1057,7 +1178,7 @@ def test_clear_state(
)
topic_in_name, _ = topic_factory()
- tx_prefix = b"key"
+ prefix = b"key"
state_manager = state_manager_factory(
group_id=consumer_group, state_dir=state_dir
@@ -1067,13 +1188,12 @@ def test_clear_state(
with state_manager:
state_manager.register_store(topic_in_name, "default")
state_manager.on_partition_assign(
- TopicPartitionStub(topic=topic_in_name, partition=0)
+ topic=topic_in_name, partition=0, committed_offset=-1001
)
store = state_manager.get_store(topic=topic_in_name, store_name="default")
with store.start_partition_transaction(partition=0) as tx:
# All keys in state must be prefixed with the message key
- with tx.with_prefix(tx_prefix):
- tx.set("my_state", True)
+ tx.set(key="my_state", value=True, prefix=prefix)
# Clear the state
app.clear_state()
@@ -1082,13 +1202,11 @@ def test_clear_state(
with state_manager:
state_manager.register_store(topic_in_name, "default")
state_manager.on_partition_assign(
- TopicPartitionStub(topic=topic_in_name, partition=0)
+ topic=topic_in_name, partition=0, committed_offset=-1001
)
store = state_manager.get_store(topic=topic_in_name, store_name="default")
with store.start_partition_transaction(partition=0) as tx:
- # All keys in state must be prefixed with the message key
- with tx.with_prefix(tx_prefix):
- assert tx.get("my_state") is None
+ assert tx.get("my_state", prefix=prefix) is None
def test_app_use_changelog_false(self, app_factory):
"""
@@ -1099,7 +1217,7 @@ def test_app_use_changelog_false(self, app_factory):
assert not app._state_manager.using_changelogs
-class TestAppRecovery:
+class TestApplicationRecovery:
def test_changelog_recovery_default_store(
self,
app_factory,
@@ -1130,14 +1248,18 @@ def sum_value(value: dict, state: State):
def get_app():
app = app_factory(
+ commit_interval=0, # Commit every processed message
auto_offset_reset="earliest",
- use_changelog_topics="True",
+ use_changelog_topics=True,
on_message_processed=on_message_processed,
consumer_group=consumer_group,
state_dir=state_dir,
)
topic = app.topic(
- topic_name, config=app._topic_manager.topic_config(num_partitions=2)
+ topic_name,
+ config=TopicConfig(
+ num_partitions=len(partition_msg_count), replication_factor=1
+ ),
)
sdf = app.dataframe(topic)
sdf = sdf.apply(sum_value, stateful=True)
@@ -1149,23 +1271,19 @@ def validate_state():
state_dir=state_dir,
) as state_manager:
state_manager.register_store(topic.name, store_name)
- for p_num in partition_msg_count:
+ for p_num, count in partition_msg_count.items():
state_manager.on_partition_assign(
- TopicPartitionStub(topic=topic.name, partition=p_num)
+ topic=topic.name, partition=p_num, committed_offset=-1001
)
- store = state_manager.get_store(topic=topic.name, store_name=store_name)
- for p_num, count in partition_msg_count.items():
- assert store._partitions[p_num].get_changelog_offset() == count
- with store.start_partition_transaction(partition=p_num) as tx:
- # All keys in state must be prefixed with the message key
- with tx.with_prefix(f"key{p_num}".encode()):
- assert tx.get(sum_key) == count * msg_int_value
-
- for p_num in partition_msg_count:
- state_manager.on_partition_revoke(
- TopicPartitionStub(topic=topic.name, partition=p_num)
+ store = state_manager.get_store(
+ topic=topic.name, store_name=store_name
)
- state_manager.clear_stores()
+ partition = store.partitions[p_num]
+ assert partition.get_changelog_offset() == count
+ with partition.begin() as tx:
+ # All keys in state must be prefixed with the message key
+ prefix = f"key{p_num}".encode()
+ assert tx.get(sum_key, prefix=prefix) == count * msg_int_value
# Produce messages to the topic and flush
app, sdf, topic = get_app()
@@ -1174,24 +1292,25 @@ def validate_state():
serialized = topic.serialize(
key=f"key{p_num}".encode(), value={"my_value": msg_int_value}
)
- data = {
- "key": serialized.key,
- "value": serialized.value,
- "partition": p_num,
- }
for _ in range(count):
- producer.produce(topic.name, **data)
+ producer.produce(
+ topic.name,
+ key=serialized.key,
+ value=serialized.value,
+ partition=p_num,
+ )
- # run app to populate state
+ # run app to populate state with data
done = Future()
executor.submit(_stop_app_on_future, app, done, 10.0)
app.run(sdf)
# validate and then delete the state
assert processed_count == partition_msg_count
- processed_count = {0: 0, 1: 0}
validate_state()
+ app.clear_state()
# run the app again and validate the recovered state
+ processed_count = {0: 0, 1: 0}
app, sdf, topic = get_app()
done = Future()
executor.submit(_stop_app_on_future, app, done, 10.0)
@@ -1253,14 +1372,18 @@ def on_message_processed(topic_, partition, offset):
def get_app():
app = app_factory(
+ commit_interval=0, # Commit every processed message
auto_offset_reset="earliest",
- use_changelog_topics="True",
+ use_changelog_topics=True,
consumer_group=consumer_group,
on_message_processed=on_message_processed,
state_dir=state_dir,
)
topic = app.topic(
- topic_name, config=app._topic_manager.topic_config(num_partitions=2)
+ topic_name,
+ config=TopicConfig(
+ num_partitions=len(partition_msg_count), replication_factor=1
+ ),
)
sdf = app.dataframe(topic)
sdf = sdf.apply(lambda row: row["my_value"])
@@ -1286,12 +1409,14 @@ def validate_state():
group_id=consumer_group, state_dir=state_dir
) as state_manager:
state_manager.register_windowed_store(topic.name, store_name)
- for p_num in partition_timestamps:
+ for p_num, windows in expected_window_updates.items():
state_manager.on_partition_assign(
- TopicPartitionStub(topic=topic.name, partition=p_num)
+ topic=topic.name, partition=p_num, committed_offset=-1001
)
- store = state_manager.get_store(topic=topic.name, store_name=store_name)
- for p_num, windows in expected_window_updates.items():
+ store = state_manager.get_store(
+ topic=topic.name, store_name=store_name
+ )
+
# in this test, each expiration check only deletes one window,
# simplifying the offset counting.
expected_offset = sum(
@@ -1299,25 +1424,21 @@ def validate_state():
) + 2 * len(expected_expired_windows[p_num])
assert (
expected_offset
- == store._partitions[p_num].get_changelog_offset()
+ == store.partitions[p_num].get_changelog_offset()
)
- with store.start_partition_transaction(partition=p_num) as tx:
- with tx.with_prefix(f"key{p_num}".encode()):
- for window, count in windows.items():
- expected = count
- if window in expected_expired_windows[p_num]:
- expected = None
- else:
- # each message value was 10
- expected *= msg_int_value
- assert tx.get_window(*window) == expected
-
- for p_num in partition_timestamps:
- state_manager.on_partition_revoke(
- TopicPartitionStub(topic=topic.name, partition=p_num)
- )
- state_manager.clear_stores()
+ partition = store.partitions[p_num]
+
+ with partition.begin() as tx:
+ prefix = f"key{p_num}".encode()
+ for window, count in windows.items():
+ expected = count
+ if window in expected_expired_windows[p_num]:
+ expected = None
+ else:
+ # each message value was 10
+ expected *= msg_int_value
+ assert tx.get_window(*window, prefix=prefix) == expected
app, sdf, topic = get_app()
# Produce messages to the topic and flush
@@ -1341,11 +1462,12 @@ def validate_state():
app.run(sdf)
# validate and then delete the state
assert processed_count == partition_msg_count
- processed_count = {0: 0, 1: 0}
validate_state()
# run the app again and validate the recovered state
+ processed_count = {0: 0, 1: 0}
app, sdf, topic = get_app()
+ app.clear_state()
done = Future()
executor.submit(_stop_app_on_future, app, done, 10.0)
app.run(sdf)
@@ -1353,3 +1475,131 @@ def validate_state():
assert processed_count == {0: 0, 1: 0}
# State should be the same as before deletion
validate_state()
+
+ def test_changelog_recovery_consistent_after_failed_commit(
+ self, app_factory, executor, tmp_path, state_manager_factory, consumer_factory
+ ):
+ """
+ Scenario: application processes messages and successfully produces changelog
+ messages but fails to commit the topic offsets.
+
+ We expect the app to recover to a consistent state, and the changes from the
+ yet-uncommitted messages should not be applied.
+ """
+ consumer_group = str(uuid.uuid4())
+ state_dir = (tmp_path / "state").absolute()
+ topic_name = str(uuid.uuid4())
+ store_name = "default"
+
+ # Messages to be processed successfully
+ succeeded_messages = [
+ ("key1", "1"),
+ ("key2", "2"),
+ ("key3", "3"),
+ ]
+ # Messages to fail
+ failed_messages = [
+ ("key1", "4"),
+ ("key2", "5"),
+ ("key3", "6"),
+ ]
+ # Ensure the same number of messages in both sets to simplify testing
+ assert len(failed_messages) == len(succeeded_messages)
+ total_count = len(succeeded_messages)
+ processed_count = 0
+
+ def on_message_processed(topic_, partition, offset):
+ nonlocal processed_count
+ # Set the callback to track total messages processed
+ # The callback is not triggered if processing fails
+ processed_count += 1
+ if processed_count == total_count:
+ done.set_result(True)
+
+ def get_app():
+ app = app_factory(
+ commit_interval=999, # Simulate a very long commit interval
+ auto_offset_reset="earliest",
+ use_changelog_topics=True,
+ on_message_processed=on_message_processed,
+ consumer_group=consumer_group,
+ state_dir=state_dir,
+ )
+ topic = app.topic(topic_name)
+ sdf = app.dataframe(topic)
+ sdf = sdf.update(
+ lambda value, state: state.set("latest", value["number"]), stateful=True
+ )
+ return app, sdf, topic
+
+ def validate_state():
+ with state_manager_factory(
+ group_id=consumer_group,
+ state_dir=state_dir,
+ ) as state_manager, consumer_factory(
+ consumer_group=consumer_group
+ ) as consumer:
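+ # Use the offset committed by the app when assigning the store partition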
+ committed_offset = consumer.committed(
+ [TopicPartition(topic=topic_name, partition=0)]
+ )[0].offset
+ state_manager.register_store(topic.name, store_name)
+ partition = state_manager.on_partition_assign(
+ topic=topic.name, partition=0, committed_offset=committed_offset
+ )[0]
+ with partition.begin() as tx:
+ for key, value in succeeded_messages:
+ state = tx.as_state(prefix=key.encode())
+ assert state.get("latest") == value
+
+ # Produce messages from the "succeeded" set
+ app, sdf, topic = get_app()
+ with app.get_producer() as producer:
+ for key, value in succeeded_messages:
+ serialized = topic.serialize(key=key.encode(), value={"number": value})
+ producer.produce(topic.name, key=serialized.key, value=serialized.value)
+
+ # Run the application to apply changes to state
+ done = Future()
+ executor.submit(_stop_app_on_future, app, done, 10.0)
+ app.run(sdf)
+ assert processed_count == total_count
+ # Validate the state
+ validate_state()
+
+ # Init application again
+ processed_count = 0
+ app, sdf, topic = get_app()
+
+ # Produce messages from the "failed" set
+ with app.get_producer() as producer:
+ for key, value in failed_messages:
+ serialized = topic.serialize(key=key.encode(), value={"number": value})
+ producer.produce(topic.name, key=serialized.key, value=serialized.value)
+
+ # Run the app second time and fail the consumer commit
+ with patch.object(
+ RowConsumer, "commit", side_effect=ValueError("commit failed")
+ ):
+ done = Future()
+ executor.submit(_stop_app_on_future, app, done, 10.0)
+ with contextlib.suppress(PartitionAssignmentError):
+ app.run(sdf)
+
+ validate_state()
+
+ # Run the app again to recover the state
+ app, sdf, topic = get_app()
+ # Clear the state to recover from scratch
+ app.clear_state()
+
+ # Run app for the third time and fail on commit to prevent state changes
+ with patch.object(
+ RowConsumer, "commit", side_effect=ValueError("commit failed")
+ ):
+ done = Future()
+ executor.submit(_stop_app_on_future, app, done, 10.0)
+ with contextlib.suppress(PartitionAssignmentError):
+ app.run(sdf)
+
+ # The app should be recovered
+ validate_state()
diff --git a/tests/test_quixstreams/test_checkpointing.py b/tests/test_quixstreams/test_checkpointing.py
new file mode 100644
index 000000000..f56fa19e1
--- /dev/null
+++ b/tests/test_quixstreams/test_checkpointing.py
@@ -0,0 +1,328 @@
+import contextlib
+from typing import Optional
+from unittest.mock import patch, MagicMock
+
+import pytest
+from confluent_kafka import TopicPartition
+
+from quixstreams.checkpointing import Checkpoint, InvalidStoredOffset
+from quixstreams.kafka import Consumer
+from quixstreams.rowproducer import RowProducer
+from quixstreams.state import StateStoreManager
+from quixstreams.state.exceptions import StoreNotRegisteredError, StoreTransactionFailed
+from quixstreams.state.rocksdb import RocksDBPartitionTransaction
+
+
+@pytest.fixture()
+def checkpoint_factory(state_manager, consumer, row_producer):
+ def factory(
+ commit_interval: float = 1,
+ consumer_: Optional[Consumer] = None,
+ producer_: Optional[RowProducer] = None,
+ state_manager_: Optional[StateStoreManager] = None,
+ ):
+ return Checkpoint(
+ commit_interval=commit_interval,
+ producer=producer_ or row_producer,
+ consumer=consumer_ or consumer,
+ state_manager=state_manager_ or state_manager,
+ )
+
+ return factory
+
+
+class TestCheckpoint:
+ def test_empty_true(self, checkpoint_factory):
+ checkpoint = checkpoint_factory()
+ assert checkpoint.empty()
+
+ def test_empty_false(self, checkpoint_factory):
+ checkpoint = checkpoint_factory()
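+ # Storing any processed offset makes the checkpoint non-empty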
+ checkpoint.store_offset("topic", 0, 0)
+ assert not checkpoint.empty()
+
+ @pytest.mark.parametrize("commit_interval, expired", [(0, True), (999, False)])
+ def test_expired(self, commit_interval, expired, checkpoint_factory):
+ checkpoint = checkpoint_factory(commit_interval=commit_interval)
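+ # commit_interval=0 expires immediately, while 999s effectively never expires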
+ assert checkpoint.expired() == expired
+
+ def test_store_already_processed_offset_fails(self, checkpoint_factory):
+ checkpoint = checkpoint_factory()
+ checkpoint.store_offset("topic", 0, 10)
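+ # Storing an offset lower than the one already stored must fail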
+ with pytest.raises(InvalidStoredOffset):
+ checkpoint.store_offset("topic", 0, 9)
+
+ def test_commit_no_state_success(
+ self, checkpoint_factory, consumer, state_manager, topic_factory
+ ):
+ topic_name, _ = topic_factory()
+ checkpoint = checkpoint_factory(
+ consumer_=consumer, state_manager_=state_manager
+ )
+ processed_offset = 999
+ # Store the processed offset to simulate processing
+ checkpoint.store_offset(topic_name, 0, processed_offset)
+
+ checkpoint.commit()
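+ # Kafka stores the offset of the next message to consume, hence processed_offset + 1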
+ tp, *_ = consumer.committed([TopicPartition(topic=topic_name, partition=0)])
+ assert tp.offset == processed_offset + 1
+
+ def test_commit_with_state_no_changelog_success(
+ self, checkpoint_factory, consumer, state_manager_factory, topic_factory
+ ):
+ topic_name, _ = topic_factory()
+ producer_mock = MagicMock(spec_set=RowProducer)
+ state_manager = state_manager_factory(producer=producer_mock)
+ checkpoint = checkpoint_factory(
+ consumer_=consumer, state_manager_=state_manager, producer_=producer_mock
+ )
+ processed_offset = 999
+ key, value, prefix = "key", "value", b"__key__"
+ state_manager.register_store(topic_name, "default")
+ store = state_manager.get_store(topic_name, "default")
+ store_partition = store.assign_partition(0)
+
+ # Do some state updates and store the processed offset to simulate processing
+ tx = checkpoint.get_store_transaction(topic_name, 0)
+ tx.set(key=key, value=value, prefix=prefix)
+ checkpoint.store_offset(topic_name, 0, processed_offset)
+
+ # Commit the checkpoint
+ checkpoint.commit()
+
+ # Check the offset is committed
+ tp, *_ = consumer.committed([TopicPartition(topic=topic_name, partition=0)])
+ assert tp.offset == processed_offset + 1
+
+ # Check the producer is flushed
+ assert producer_mock.flush.call_count == 1
+
+ # Check the state is flushed
+ assert tx.completed
+ new_tx = store.start_partition_transaction(0)
+ assert new_tx.get(key=key, prefix=prefix) == value
+
+ # No changelogs should be flushed
+ assert not store_partition.get_changelog_offset()
+ # Processed offset should be stored
+ assert store_partition.get_processed_offset() == processed_offset
+
+ def test_commit_with_state_with_changelog_success(
+ self,
+ checkpoint_factory,
+ row_producer,
+ consumer,
+ state_manager_factory,
+ recovery_manager_factory,
+ topic_factory,
+ ):
+ topic_name, _ = topic_factory()
+ recovery_manager = recovery_manager_factory(consumer=consumer)
+ state_manager = state_manager_factory(
+ producer=row_producer, recovery_manager=recovery_manager
+ )
+ checkpoint = checkpoint_factory(
+ consumer_=consumer, state_manager_=state_manager, producer_=row_producer
+ )
+ processed_offset = 999
+ value, prefix = "value", b"__key__"
+ state_manager.register_store(topic_name, "default")
+ store = state_manager.get_store(topic_name, "default")
+ store_partition = store.assign_partition(0)
+
+ # Do a couple of state updates to send more messages to the changelog
+ tx = checkpoint.get_store_transaction(topic_name, 0)
+ tx.set(key="key1", value=value, prefix=prefix)
+ tx.set(key="key2", value=value, prefix=prefix)
+ checkpoint.store_offset(topic_name, 0, processed_offset)
+
+ # Commit the checkpoint
+ checkpoint.commit()
+
+ # Check the state is flushed
+ assert tx.completed
+
+ # Check the changelog offset
+ # The changelog offset must be equal to a number of updated keys
+ assert store_partition.get_changelog_offset() == 2
+ assert store_partition.get_processed_offset() == 999
+
+ def test_commit_with_state_and_changelog_no_updates_success(
+ self,
+ checkpoint_factory,
+ row_producer,
+ consumer,
+ state_manager_factory,
+ recovery_manager_factory,
+ topic_factory,
+ ):
+ topic_name, _ = topic_factory()
+ recovery_manager = recovery_manager_factory(consumer=consumer)
+ state_manager = state_manager_factory(
+ producer=row_producer, recovery_manager=recovery_manager
+ )
+ checkpoint = checkpoint_factory(
+ consumer_=consumer, state_manager_=state_manager, producer_=row_producer
+ )
+ processed_offset = 999
+ value, prefix = "value", b"__key__"
+ state_manager.register_store(topic_name, "default")
+ store = state_manager.get_store(topic_name, "default")
+ store_partition = store.assign_partition(0)
+
+ # Create a transaction but don't update any keys
+ tx = checkpoint.get_store_transaction(topic_name, 0)
+ checkpoint.store_offset(topic_name, 0, processed_offset)
+
+ # Commit the checkpoint
+ checkpoint.commit()
+
+ # The transaction is completed, but nothing is flushed to the changelog
+ assert tx.completed
+
+ # The changelog and processed offsets should be empty because no updates
+ # happened during the transaction
+ assert not store_partition.get_changelog_offset()
+ assert not store_partition.get_processed_offset()
+
+ def test_commit_no_offsets_stored_noop(
+ self, checkpoint_factory, state_manager_factory, topic_factory
+ ):
+ topic_name, _ = topic_factory()
+ producer_mock = MagicMock(spec_set=RowProducer)
+ consumer_mock = MagicMock(spec_set=Consumer)
+ state_manager = state_manager_factory(producer=producer_mock)
+ checkpoint = checkpoint_factory(
+ consumer_=consumer_mock,
+ state_manager_=state_manager,
+ producer_=producer_mock,
+ )
+ # Commit the checkpoint without processing any messages
+ checkpoint.commit()
+
+ # Check nothing is committed
+ assert not consumer_mock.commit.call_count
+ assert not producer_mock.flush.call_count
+
+ def test_commit_has_failed_transactions_fails(
+ self, checkpoint_factory, state_manager_factory, topic_factory
+ ):
+ producer_mock = MagicMock(spec_set=RowProducer)
+ consumer_mock = MagicMock(spec_set=Consumer)
+ state_manager = state_manager_factory(producer=producer_mock)
+ checkpoint = checkpoint_factory(
+ consumer_=consumer_mock,
+ state_manager_=state_manager,
+ producer_=producer_mock,
+ )
+ processed_offset = 999
+ key, value, prefix = "key", "value", b"__key__"
+ state_manager.register_store("topic", "default")
+ store = state_manager.get_store("topic", "default")
+ store.assign_partition(0)
+
+ # Simulate a failed transaction
+ tx = checkpoint.get_store_transaction("topic", 0)
+ with contextlib.suppress(ValueError), patch.object(
+ RocksDBPartitionTransaction,
+ "_serialize_key",
+ side_effect=ValueError("test"),
+ ):
+ tx.set(key=key, value=value, prefix=prefix)
+ assert tx.failed
+
+ # Store offset to simulate processing
+ checkpoint.store_offset("topic", 0, processed_offset)
+
+ # Checkpoint commit should fail if any of the transactions has failed,
+ # even though the original exception was swallowed by an error callback
+ with pytest.raises(StoreTransactionFailed):
+ checkpoint.commit()
+
+ # The producer should not flush
+ assert not producer_mock.flush.call_count
+ # Consumer should not commit
+ assert not consumer_mock.commit.call_count
+
+ def test_commit_producer_flush_fails(
+ self, checkpoint_factory, state_manager_factory, topic_factory
+ ):
+ producer_mock = MagicMock(spec_set=RowProducer)
+ consumer_mock = MagicMock(spec_set=Consumer)
+ state_manager = state_manager_factory(producer=producer_mock)
+ checkpoint = checkpoint_factory(
+ consumer_=consumer_mock,
+ state_manager_=state_manager,
+ producer_=producer_mock,
+ )
+ processed_offset = 999
+ key, value, prefix = "key", "value", b"__key__"
+ state_manager.register_store("topic", "default")
+ store = state_manager.get_store("topic", "default")
+ store.assign_partition(0)
+
+ # Do some state updates and store the processed offset to simulate processing
+ tx = checkpoint.get_store_transaction("topic", 0)
+ tx.set(key=key, value=value, prefix=prefix)
+ checkpoint.store_offset("topic", 0, processed_offset)
+
+ producer_mock.flush.side_effect = ValueError("Flush failure")
+ # Checkpoint commit should fail if producer failed to flush
+ with pytest.raises(ValueError):
+ checkpoint.commit()
+
+ # Consumer should not commit
+ assert not consumer_mock.commit.call_count
+ # The transaction should remain prepared, but not completed
+ assert tx.prepared
+ assert not tx.completed
+
+ def test_commit_consumer_commit_fails(
+ self, checkpoint_factory, state_manager_factory, topic_factory
+ ):
+ producer_mock = MagicMock(spec_set=RowProducer)
+ consumer_mock = MagicMock(spec_set=Consumer)
+ state_manager = state_manager_factory(producer=producer_mock)
+ checkpoint = checkpoint_factory(
+ consumer_=consumer_mock,
+ state_manager_=state_manager,
+ producer_=producer_mock,
+ )
+ processed_offset = 999
+ key, value, prefix = "key", "value", b"__key__"
+ state_manager.register_store("topic", "default")
+ store = state_manager.get_store("topic", "default")
+ store.assign_partition(0)
+
+ # Do some state updates and store the processed offset to simulate processing
+ tx = checkpoint.get_store_transaction("topic", 0)
+ tx.set(key=key, value=value, prefix=prefix)
+ checkpoint.store_offset("topic", 0, processed_offset)
+
+ consumer_mock.commit.side_effect = ValueError("Commit failure")
+ # Checkpoint commit should fail if consumer failed to commit
+ with pytest.raises(ValueError):
+ checkpoint.commit()
+
+ # Producer should flush
+ assert producer_mock.flush.call_count
+ # The transaction should remain prepared, but not completed
+ assert tx.prepared
+ assert not tx.completed
+
+ def test_get_store_transaction_store_not_registered_fails(self, checkpoint_factory):
+ checkpoint = checkpoint_factory()
+ with pytest.raises(StoreNotRegisteredError):
+ with checkpoint.get_store_transaction("topic", 0, "default"):
+ ...
+
+ def test_get_store_transaction_success(self, checkpoint_factory, state_manager):
+ state_manager.register_store("topic", "default")
+ store = state_manager.get_store("topic", "default")
+ store.assign_partition(0)
+
+ checkpoint = checkpoint_factory(state_manager_=state_manager)
+ tx = checkpoint.get_store_transaction("topic", 0, "default")
+ assert tx
+ tx2 = checkpoint.get_store_transaction("topic", 0, "default")
+ assert tx2 is tx
diff --git a/tests/test_quixstreams/test_dataframe/fixtures.py b/tests/test_quixstreams/test_dataframe/fixtures.py
index ac61f0c83..3d8c7f706 100644
--- a/tests/test_quixstreams/test_dataframe/fixtures.py
+++ b/tests/test_quixstreams/test_dataframe/fixtures.py
@@ -5,6 +5,9 @@
from quixstreams.dataframe.dataframe import StreamingDataFrame
from quixstreams.models.topics import Topic
+from quixstreams.processing_context import ProcessingContext
+from quixstreams.rowconsumer import RowConsumer
+from quixstreams.rowproducer import RowProducer
from quixstreams.state import StateStoreManager
@@ -13,10 +16,20 @@ def dataframe_factory(topic_manager_topic_factory):
def factory(
topic: Optional[Topic] = None,
state_manager: Optional[StateStoreManager] = None,
+ producer: Optional[RowProducer] = None,
) -> StreamingDataFrame:
- return StreamingDataFrame(
- topic=topic or topic_manager_topic_factory("test"),
- state_manager=state_manager or MagicMock(spec=StateStoreManager),
+ producer = producer if producer is not None else MagicMock(spec_set=RowProducer)
+ state_manager = state_manager or MagicMock(spec=StateStoreManager)
+ topic = topic or topic_manager_topic_factory("test")
+
+ processing_ctx = ProcessingContext(
+ producer=producer,
+ consumer=MagicMock(spec_set=RowConsumer),
+ commit_interval=0,
+ state_manager=state_manager,
)
+ processing_ctx.init_checkpoint()
+
+ return StreamingDataFrame(topic=topic, processing_context=processing_ctx)
return factory
diff --git a/tests/test_quixstreams/test_dataframe/test_dataframe.py b/tests/test_quixstreams/test_dataframe/test_dataframe.py
index a7174cedd..8ba57a724 100644
--- a/tests/test_quixstreams/test_dataframe/test_dataframe.py
+++ b/tests/test_quixstreams/test_dataframe/test_dataframe.py
@@ -7,8 +7,7 @@
from quixstreams.core.stream import Filtered
from quixstreams.dataframe.exceptions import InvalidOperation
from quixstreams.dataframe.windows import WindowResult
-from quixstreams.models import MessageTimestamp, Topic
-from tests.utils import TopicPartitionStub
+from quixstreams.models import MessageTimestamp
class TestStreamingDataFrame:
@@ -291,8 +290,7 @@ def test_to_topic(
)
producer = row_producer_factory()
- sdf = dataframe_factory()
- sdf.producer = producer
+ sdf = dataframe_factory(producer=producer)
sdf = sdf.to_topic(topic)
value = {"x": 1, "y": 2}
@@ -331,8 +329,7 @@ def test_to_topic_apply_expand(
)
producer = row_producer_factory()
- sdf = dataframe_factory()
- sdf.producer = producer
+ sdf = dataframe_factory(producer=producer)
sdf = sdf.apply(lambda v: [v, v], expand=True).to_topic(topic)
@@ -377,8 +374,7 @@ def test_to_topic_custom_key(
)
producer = row_producer_factory()
- sdf = dataframe_factory()
- sdf.producer = producer
+ sdf = dataframe_factory(producer=producer)
# Use value["x"] as a new key
sdf = sdf.to_topic(topic, key=lambda v: v["x"])
@@ -412,17 +408,14 @@ def test_to_topic_multiple_topics_out(
topic_manager_topic_factory,
):
topic_0 = topic_manager_topic_factory(
- value_serializer="json",
- value_deserializer="json",
+ value_serializer="json", value_deserializer="json"
)
topic_1 = topic_manager_topic_factory(
- value_serializer="json",
- value_deserializer="json",
+ value_serializer="json", value_deserializer="json"
)
producer = row_producer_factory()
- sdf = dataframe_factory()
- sdf.producer = producer
+ sdf = dataframe_factory(producer=producer)
sdf = sdf.to_topic(topic_0).to_topic(topic_1)
@@ -453,29 +446,6 @@ def test_to_topic_multiple_topics_out(
assert consumed_row.key == ctx.key
assert consumed_row.value == value
- def test_to_topic_no_producer_assigned(
- self, dataframe_factory, topic_manager_topic_factory
- ):
- topic = topic_manager_topic_factory()
-
- sdf = dataframe_factory()
- sdf = sdf.to_topic(topic)
-
- value = {"x": "1", "y": "2"}
- ctx = MessageContext(
- key=b"test",
- topic="test",
- partition=0,
- offset=0,
- size=0,
- timestamp=MessageTimestamp.create(0, 0),
- )
-
- with pytest.raises(
- RuntimeError, match="Producer instance has not been provided"
- ):
- sdf.test(value, ctx=ctx)
-
class TestStreamingDataframeStateful:
def test_apply_stateful(
@@ -496,7 +466,7 @@ def stateful_func(value_: dict, state: State) -> int:
sdf = sdf.apply(stateful_func, stateful=True)
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
values = [
{"number": 1},
@@ -513,10 +483,7 @@ def stateful_func(value_: dict, state: State) -> int:
timestamp=MessageTimestamp.create(0, 0),
)
for value in values:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- result = sdf.test(value, ctx)
+ result = sdf.test(value, ctx)
assert result == 10
@@ -538,7 +505,7 @@ def stateful_func(value_: dict, state: State):
sdf = sdf.update(stateful_func, stateful=True)
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
result = None
values = [
@@ -555,10 +522,7 @@ def stateful_func(value_: dict, state: State):
timestamp=MessageTimestamp.create(0, 0),
)
for value in values:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- result = sdf.test(value, ctx)
+ result = sdf.test(value, ctx)
assert result is not None
assert result["max"] == 10
@@ -582,7 +546,7 @@ def stateful_func(value_: dict, state: State):
sdf = sdf.filter(lambda v, state: state.get("max") >= 3, stateful=True)
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
values = [
{"number": 1},
@@ -599,13 +563,10 @@ def stateful_func(value_: dict, state: State):
)
results = []
for value in values:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- try:
- results.append(sdf.test(value, ctx))
- except Filtered:
- pass
+ try:
+ results.append(sdf.test(value, ctx))
+ except Filtered:
+ pass
assert len(results) == 1
assert results[0]["max"] == 3
@@ -628,7 +589,7 @@ def stateful_func(value_: dict, state: State):
sdf = sdf[sdf.apply(lambda v, state: state.get("max") >= 3, stateful=True)]
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
values = [
{"number": 1},
@@ -645,13 +606,10 @@ def stateful_func(value_: dict, state: State):
)
results = []
for value in values:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- try:
- results.append(sdf.test(value, ctx))
- except Filtered:
- pass
+ try:
+ results.append(sdf.test(value, ctx))
+ except Filtered:
+ pass
assert len(results) == 1
assert results[0]["max"] == 3
@@ -711,7 +669,7 @@ def test_tumbling_window_current(
)
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
messages = [
# Message early in the window
@@ -724,10 +682,7 @@ def test_tumbling_window_current(
results = []
for value, ctx in messages:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- results += sdf.test(value=value, ctx=ctx)
+ results += sdf.test(value=value, ctx=ctx)
assert len(results) == 3
assert results == [
WindowResult(value=1, start=0, end=10000),
@@ -754,7 +709,7 @@ def test_tumbling_window_current_out_of_order_late(
sdf = sdf.tumbling_window(duration_ms=10, grace_ms=0).sum().current()
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
messages = [
# Create window [0, 10)
@@ -768,11 +723,8 @@ def test_tumbling_window_current_out_of_order_late(
results = []
for value, ctx in messages:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- result = sdf.test(value=value, ctx=ctx)
- results += result
+ result = sdf.test(value=value, ctx=ctx)
+ results += result
assert len(results) == 2
assert results == [
@@ -795,7 +747,7 @@ def test_tumbling_window_final(
sdf = sdf.tumbling_window(duration_ms=10, grace_ms=0).sum().final()
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
messages = [
# Create window [0, 10)
@@ -812,11 +764,7 @@ def test_tumbling_window_final(
results = []
for value, ctx in messages:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- result = sdf.test(value=value, ctx=ctx)
- results += result
+ results += sdf.test(value=value, ctx=ctx)
assert len(results) == 2
assert results == [
@@ -837,7 +785,7 @@ def test_tumbling_window_none_key_messages(
sdf = sdf.tumbling_window(duration_ms=10).sum().current()
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
messages = [
# Create window [0,10)
@@ -850,10 +798,7 @@ def test_tumbling_window_none_key_messages(
results = []
for value, ctx in messages:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- results += sdf.test(value=value, ctx=ctx)
+ results += sdf.test(value=value, ctx=ctx)
assert len(results) == 2
# Ensure that the windows are returned with correct values and order
@@ -944,7 +889,7 @@ def test_hopping_window_current(
sdf = sdf.hopping_window(duration_ms=10, step_ms=5).sum().current()
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
messages = [
# Create window [0,10)
@@ -961,10 +906,7 @@ def test_hopping_window_current(
results = []
for value, ctx in messages:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- results += sdf.test(value=value, ctx=ctx)
+ results += sdf.test(value=value, ctx=ctx)
assert len(results) == 9
# Ensure that the windows are returned with correct values and order
@@ -993,7 +935,7 @@ def test_hopping_window_current_out_of_order_late(
sdf = sdf.hopping_window(duration_ms=10, step_ms=5).sum().current()
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
messages = [
# Create window [0,10)
@@ -1008,10 +950,7 @@ def test_hopping_window_current_out_of_order_late(
results = []
for value, ctx in messages:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- results += sdf.test(value=value, ctx=ctx)
+ results += sdf.test(value=value, ctx=ctx)
assert len(results) == 5
# Ensure that the windows are returned with correct values and order
@@ -1036,7 +975,7 @@ def test_hopping_window_final(
sdf = sdf.hopping_window(duration_ms=10, step_ms=5).sum().final()
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
messages = [
# Create window [0,10)
@@ -1053,11 +992,9 @@ def test_hopping_window_final(
]
results = []
+
for value, ctx in messages:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- results += sdf.test(value=value, ctx=ctx)
+ results += sdf.test(value=value, ctx=ctx)
assert len(results) == 3
# Ensure that the windows are returned with correct values and order
@@ -1080,7 +1017,7 @@ def test_hopping_window_none_key_messages(
sdf = sdf.hopping_window(duration_ms=10, step_ms=5).sum().current()
state_manager.on_partition_assign(
- tp=TopicPartitionStub(topic=topic.name, partition=0)
+ topic=topic.name, partition=0, committed_offset=-1001
)
messages = [
# Create window [0,10)
@@ -1093,10 +1030,7 @@ def test_hopping_window_none_key_messages(
results = []
for value, ctx in messages:
- with state_manager.start_store_transaction(
- topic=ctx.topic, partition=ctx.partition, offset=ctx.offset
- ):
- results += sdf.test(value=value, ctx=ctx)
+ results += sdf.test(value=value, ctx=ctx)
assert len(results) == 2
# Ensure that the windows are returned with correct values and order
diff --git a/tests/test_quixstreams/test_dataframe/test_windows/test_hopping.py b/tests/test_quixstreams/test_dataframe/test_windows/test_hopping.py
index 93d5accf2..b72ab6fba 100644
--- a/tests/test_quixstreams/test_dataframe/test_windows/test_hopping.py
+++ b/tests/test_quixstreams/test_dataframe/test_windows/test_hopping.py
@@ -55,11 +55,11 @@ def test_hoppingwindow_count(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- window.process_window(value=2, state=tx.state, timestamp_ms=100)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ window.process_window(value=2, state=state, timestamp_ms=100)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 2
assert updated[0]["value"] == 2
assert updated[0]["start"] == 95
@@ -77,11 +77,11 @@ def test_hoppingwindow_sum(self, hopping_window_definition_factory, state_manage
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- window.process_window(value=2, state=tx.state, timestamp_ms=100)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ window.process_window(value=2, state=state, timestamp_ms=100)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 2
assert updated[0]["value"] == 3
assert updated[0]["start"] == 95
@@ -99,11 +99,11 @@ def test_hoppingwindow_mean(self, hopping_window_definition_factory, state_manag
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- window.process_window(value=2, state=tx.state, timestamp_ms=100)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ window.process_window(value=2, state=state, timestamp_ms=100)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 2
assert updated[0]["value"] == 1.5
assert updated[0]["start"] == 95
@@ -126,10 +126,10 @@ def test_hoppingwindow_reduce(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 2
assert updated[0]["value"] == [1]
assert updated[0]["start"] == 95
@@ -147,10 +147,10 @@ def test_hoppingwindow_max(self, hopping_window_definition_factory, state_manage
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 2
assert updated[0]["value"] == 1
assert updated[0]["start"] == 95
@@ -168,10 +168,10 @@ def test_hoppingwindow_min(self, hopping_window_definition_factory, state_manage
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 2
assert updated[0]["value"] == 1
assert updated[0]["start"] == 95
@@ -218,31 +218,29 @@ def test_hopping_window_process_window_expired(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- # Add item to the windows [95, 105) and [100, 110)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
- assert len(updated) == 2
- assert updated[0]["value"] == 1
- assert updated[0]["start"] == 95
- assert updated[0]["end"] == 105
- assert updated[1]["value"] == 1
- assert updated[1]["start"] == 100
- assert updated[1]["end"] == 110
-
- assert not expired
-
- # Now add item to the windows [105, 115) and [110, 120)
- # The windows [95, 105) and [100, 110) are now expired
- # and should be returned
- _, expired = window.process_window(
- value=2, state=tx.state, timestamp_ms=110
- )
- assert len(expired) == 2
- assert expired[0]["value"] == 1
- assert expired[0]["start"] == 95
- assert expired[0]["end"] == 105
- assert expired[1]["value"] == 1
- assert expired[1]["start"] == 100
- assert expired[1]["end"] == 110
+ state = tx.as_state(prefix=b"key")
+ # Add item to the windows [95, 105) and [100, 110)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
+ assert len(updated) == 2
+ assert updated[0]["value"] == 1
+ assert updated[0]["start"] == 95
+ assert updated[0]["end"] == 105
+ assert updated[1]["value"] == 1
+ assert updated[1]["start"] == 100
+ assert updated[1]["end"] == 110
+
+ assert not expired
+
+ # Now add item to the windows [105, 115) and [110, 120)
+ # The windows [95, 105) and [100, 110) are now expired
+ # and should be returned
+ _, expired = window.process_window(value=2, state=state, timestamp_ms=110)
+ assert len(expired) == 2
+ assert expired[0]["value"] == 1
+ assert expired[0]["start"] == 95
+ assert expired[0]["end"] == 105
+ assert expired[1]["value"] == 1
+ assert expired[1]["start"] == 100
+ assert expired[1]["end"] == 110
diff --git a/tests/test_quixstreams/test_dataframe/test_windows/test_tumbling.py b/tests/test_quixstreams/test_dataframe/test_windows/test_tumbling.py
index e255b55cf..3216d9d06 100644
--- a/tests/test_quixstreams/test_dataframe/test_windows/test_tumbling.py
+++ b/tests/test_quixstreams/test_dataframe/test_windows/test_tumbling.py
@@ -51,11 +51,11 @@ def test_tumblingwindow_count(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- window.process_window(value=0, state=tx.state, timestamp_ms=100)
- updated, expired = window.process_window(
- value=0, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ window.process_window(value=0, state=state, timestamp_ms=100)
+ updated, expired = window.process_window(
+ value=0, state=state, timestamp_ms=100
+ )
assert len(updated) == 1
assert updated[0]["value"] == 2
assert not expired
@@ -69,11 +69,11 @@ def test_tumblingwindow_sum(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- window.process_window(value=2, state=tx.state, timestamp_ms=100)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ window.process_window(value=2, state=state, timestamp_ms=100)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 1
assert updated[0]["value"] == 3
assert not expired
@@ -87,11 +87,11 @@ def test_tumblingwindow_mean(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- window.process_window(value=2, state=tx.state, timestamp_ms=100)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ window.process_window(value=2, state=state, timestamp_ms=100)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 1
assert updated[0]["value"] == 1.5
assert not expired
@@ -108,11 +108,11 @@ def test_tumblingwindow_reduce(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- window.process_window(value=2, state=tx.state, timestamp_ms=100)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ window.process_window(value=2, state=state, timestamp_ms=100)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 1
assert updated[0]["value"] == [2, 1]
assert not expired
@@ -126,11 +126,11 @@ def test_tumblingwindow_max(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- window.process_window(value=2, state=tx.state, timestamp_ms=100)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ window.process_window(value=2, state=state, timestamp_ms=100)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 1
assert updated[0]["value"] == 2
assert not expired
@@ -144,11 +144,11 @@ def test_tumblingwindow_min(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- window.process_window(value=2, state=tx.state, timestamp_ms=100)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
+ state = tx.as_state(prefix=b"key")
+ window.process_window(value=2, state=state, timestamp_ms=100)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
assert len(updated) == 1
assert updated[0]["value"] == 1
assert not expired
@@ -182,28 +182,28 @@ def test_tumbling_window_process_window_expired(
store = state_manager.get_store(topic="test", store_name=window.name)
store.assign_partition(0)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(prefix=b"key"):
- # Add item to the window [100, 110)
- updated, expired = window.process_window(
- value=1, state=tx.state, timestamp_ms=100
- )
- assert len(updated) == 1
- assert updated[0]["value"] == 1
- assert updated[0]["start"] == 100
- assert updated[0]["end"] == 110
- assert not expired
-
- # Now add item to the window [110, 120)
- # The window [100, 110) is now expired and should be returned
- updated, expired = window.process_window(
- value=2, state=tx.state, timestamp_ms=110
- )
- assert len(updated) == 1
- assert updated[0]["value"] == 2
- assert updated[0]["start"] == 110
- assert updated[0]["end"] == 120
-
- assert len(expired) == 1
- assert expired[0]["value"] == 1
- assert expired[0]["start"] == 100
- assert expired[0]["end"] == 110
+ state = tx.as_state(prefix=b"key")
+ # Add item to the window [100, 110)
+ updated, expired = window.process_window(
+ value=1, state=state, timestamp_ms=100
+ )
+ assert len(updated) == 1
+ assert updated[0]["value"] == 1
+ assert updated[0]["start"] == 100
+ assert updated[0]["end"] == 110
+ assert not expired
+
+ # Now add item to the window [110, 120)
+ # The window [100, 110) is now expired and should be returned
+ updated, expired = window.process_window(
+ value=2, state=state, timestamp_ms=110
+ )
+ assert len(updated) == 1
+ assert updated[0]["value"] == 2
+ assert updated[0]["start"] == 110
+ assert updated[0]["end"] == 120
+
+ assert len(expired) == 1
+ assert expired[0]["value"] == 1
+ assert expired[0]["start"] == 100
+ assert expired[0]["end"] == 110
diff --git a/tests/test_quixstreams/test_kafka/test_producer.py b/tests/test_quixstreams/test_kafka/test_producer.py
index 0d6b1f403..470495f12 100644
--- a/tests/test_quixstreams/test_kafka/test_producer.py
+++ b/tests/test_quixstreams/test_kafka/test_producer.py
@@ -16,6 +16,19 @@ def test_produce(self, producer, topic_factory):
)
producer.poll(1.0)
+ def test_produce_on_delivery_callback(self, producer, topic_factory):
+ topic_name, _ = topic_factory()
+
+ offsets = []
+ with producer:
+ producer.produce(
+ topic=topic_name,
+ key="test",
+ value=b"test",
+ on_delivery=lambda error, msg: offsets.append(msg.offset()),
+ )
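+            # By the time the producer context manager exits, the delivery callback
+            # should have fired and recorded the message offset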
+ assert len(offsets) == 1
+
def test_produce_failure_no_error(self, producer_factory, topic_factory):
topic_name, _ = topic_factory()
extra_config = {
diff --git a/tests/test_quixstreams/test_models/fixtures.py b/tests/test_quixstreams/test_models/fixtures.py
index 332849ff0..f0e7c7f30 100644
--- a/tests/test_quixstreams/test_models/fixtures.py
+++ b/tests/test_quixstreams/test_models/fixtures.py
@@ -5,7 +5,7 @@
import pytest
-from ..utils import ConfluentKafkaMessageStub
+from tests.utils import ConfluentKafkaMessageStub
@pytest.fixture()
diff --git a/tests/test_quixstreams/test_models/test_topics/test_topics.py b/tests/test_quixstreams/test_models/test_topics/test_topics.py
index 3ec4e8121..aa63e37ca 100644
--- a/tests/test_quixstreams/test_models/test_topics/test_topics.py
+++ b/tests/test_quixstreams/test_models/test_topics/test_topics.py
@@ -26,8 +26,8 @@
SERIALIZERS,
DESERIALIZERS,
)
+from tests.utils import ConfluentKafkaMessageStub
from ..utils import int_to_bytes, float_to_bytes
-from ...utils import ConfluentKafkaMessageStub
class JSONListDeserializer(JSONDeserializer):
diff --git a/tests/test_quixstreams/test_rowconsumer.py b/tests/test_quixstreams/test_rowconsumer.py
index a61fe4045..834050778 100644
--- a/tests/test_quixstreams/test_rowconsumer.py
+++ b/tests/test_quixstreams/test_rowconsumer.py
@@ -7,7 +7,7 @@
IgnoreMessage,
SerializationError,
)
-from quixstreams.rowconsumer import KafkaMessageError
+from quixstreams.kafka.exceptions import KafkaConsumerException
from tests.utils import Timeout
@@ -61,7 +61,7 @@ def test_poll_row_kafka_error(
auto_offset_reset="earliest",
) as consumer:
consumer.subscribe([topic])
- with pytest.raises(KafkaMessageError) as raised:
+ with pytest.raises(KafkaConsumerException) as raised:
consumer.poll_row(10.0)
exc = raised.value
assert exc.code == KafkaError.UNKNOWN_TOPIC_OR_PART
@@ -112,7 +112,7 @@ def test_poll_row_kafka_error_raise(
producer.produce(topic.name, key=b"key", value=b"value")
producer.flush()
consumer.subscribe([topic])
- with pytest.raises(KafkaMessageError):
+ with pytest.raises(KafkaConsumerException):
consumer.poll_row(10.0)
def test_poll_row_deserialization_error_suppress(
@@ -147,7 +147,7 @@ def test_poll_row_kafka_error_suppress(
suppressed = False
def on_error(exc, *args):
- assert isinstance(exc, KafkaMessageError)
+ assert isinstance(exc, KafkaConsumerException)
nonlocal suppressed
suppressed = True
return True
diff --git a/tests/test_quixstreams/test_rowproducer.py b/tests/test_quixstreams/test_rowproducer.py
index 1a06cdfb6..ab0e03e27 100644
--- a/tests/test_quixstreams/test_rowproducer.py
+++ b/tests/test_quixstreams/test_rowproducer.py
@@ -1,7 +1,8 @@
from concurrent.futures import Future
import pytest
-from confluent_kafka import KafkaException
+from confluent_kafka import KafkaException as ConfluentKafkaException
+from quixstreams.kafka.exceptions import KafkaProducerDeliveryError
from quixstreams.models import (
JSONSerializer,
@@ -17,15 +18,12 @@ def test_produce_row_success(
topic_json_serdes_factory,
row_factory,
):
- topic = topic_json_serdes_factory()
-
+ topic = topic_json_serdes_factory(num_partitions=1)
key = b"key"
value = {"field": "value"}
headers = [("header1", b"1")]
- with row_consumer_factory(
- auto_offset_reset="earliest"
- ) as consumer, row_producer_factory() as producer:
+ with row_producer_factory() as producer:
row = row_factory(
topic=topic.name,
value=value,
@@ -33,8 +31,14 @@ def test_produce_row_success(
headers=headers,
)
producer.produce_row(topic=topic, row=row)
+
+ with row_consumer_factory(auto_offset_reset="earliest") as consumer:
consumer.subscribe([topic])
row = consumer.poll_row(timeout=5.0)
+
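+        # The producer should have tracked the delivery offset for the
+        # produced (topic, partition) pair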
+ assert producer.offsets
+ assert producer.offsets.get((topic.name, 0)) is not None
+
assert row
assert row.key == key
assert row.value == value
@@ -97,7 +101,7 @@ def test_produce_row_produce_error_raise(
topic=topic.name,
value={"field": 1001 * "a"},
)
- with pytest.raises(KafkaException):
+ with pytest.raises(ConfluentKafkaException):
producer.produce_row(topic=topic, row=row)
def test_produce_row_serialization_error_suppress(
@@ -122,3 +126,37 @@ def on_error(exc, *args):
value=object(),
)
producer.produce_row(topic=topic, row=row)
+
+ def test_produce_delivery_error_raised_on_produce(
+ self, row_producer_factory, topic_json_serdes_factory
+ ):
+ topic = topic_json_serdes_factory(num_partitions=1)
+ key = b"key"
+ value = b"value"
+
+ producer = row_producer_factory()
+
+ # Send message to a non-existing partition to simulate error
+ # in the delivery callback
+ producer.produce(topic=topic.name, key=key, value=value, partition=3)
+ # Poll for delivery callbacks
+ producer.poll(5)
+        # The next produce call should fail after that
+ with pytest.raises(KafkaProducerDeliveryError):
+ producer.produce(topic=topic.name, key=key, value=value)
+
+ def test_produce_delivery_error_raised_on_flush(
+ self, row_producer_factory, topic_json_serdes_factory
+ ):
+ topic = topic_json_serdes_factory(num_partitions=1)
+ key = b"key"
+ value = b"value"
+
+ producer = row_producer_factory()
+
+ # Send message to a non-existing partition to simulate error
+ # in the delivery callback
+ producer.produce(topic=topic.name, key=key, value=value, partition=3)
+ # The flush should fail after that
+ with pytest.raises(KafkaProducerDeliveryError):
+ producer.flush()
diff --git a/tests/test_quixstreams/test_state/fixtures.py b/tests/test_quixstreams/test_state/fixtures.py
index 9d2302c64..20184a187 100644
--- a/tests/test_quixstreams/test_state/fixtures.py
+++ b/tests/test_quixstreams/test_state/fixtures.py
@@ -1,26 +1,26 @@
-import pytest
import uuid
-
from typing import Optional
-from unittest.mock import create_autospec
+from unittest.mock import MagicMock
+
+import pytest
from quixstreams.kafka import Consumer
+from quixstreams.models import TopicManager
from quixstreams.state.recovery import RecoveryPartition, RecoveryManager
from quixstreams.state.types import StorePartition
@pytest.fixture()
-def recovery_partition_store_mock():
- store = create_autospec(StorePartition)()
- store.get_changelog_offset.return_value = 15
- recovery_partition = RecoveryPartition(
- changelog_name=f"changelog__{str(uuid.uuid4())}",
- partition_num=0,
- store_partition=store,
- )
- recovery_partition._changelog_lowwater = 10
- recovery_partition._changelog_highwater = 20
- return recovery_partition
+def recovery_manager_factory(topic_manager_factory):
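+    """Creates RecoveryManager instances; the Consumer is mocked when not provided"""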
+ def factory(
+ topic_manager: Optional[TopicManager] = None,
+ consumer: Optional[Consumer] = None,
+ ) -> RecoveryManager:
+ topic_manager = topic_manager or topic_manager_factory()
+ consumer = consumer or MagicMock(Consumer)
+ return RecoveryManager(topic_manager=topic_manager, consumer=consumer)
+
+ return factory
@pytest.fixture()
@@ -28,33 +28,20 @@ def recovery_partition_factory():
"""Mocks a StorePartition if none provided"""
def factory(
- changelog_name: str = str(uuid.uuid4()),
+ changelog_name: str = "",
partition_num: int = 0,
- mocked_changelog_offset: Optional[int] = 15,
- lowwater: Optional[int] = None,
- highwater: Optional[int] = None,
store_partition: Optional[StorePartition] = None,
+ committed_offset: int = -1001,
):
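+        # Generate a unique changelog name when one isn't supplied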
+ changelog_name = changelog_name or f"changelog__{str(uuid.uuid4())}"
if not store_partition:
- store_partition = create_autospec(StorePartition)()
- store_partition.get_changelog_offset.return_value = mocked_changelog_offset
+ store_partition = MagicMock(spec_set=StorePartition)
recovery_partition = RecoveryPartition(
changelog_name=changelog_name,
partition_num=partition_num,
store_partition=store_partition,
+ committed_offset=committed_offset,
)
- if lowwater:
- recovery_partition._changelog_lowwater = lowwater
- if highwater:
- recovery_partition._changelog_highwater = highwater
return recovery_partition
return factory
-
-
-@pytest.fixture()
-def recovery_manager_mock_consumer(topic_manager_factory):
- return RecoveryManager(
- consumer=create_autospec(Consumer)("broker", "group", "latest"),
- topic_manager=topic_manager_factory(),
- )
diff --git a/tests/test_quixstreams/test_state/test_manager.py b/tests/test_quixstreams/test_state/test_manager.py
index 9942c6f1f..1043171f3 100644
--- a/tests/test_quixstreams/test_state/test_manager.py
+++ b/tests/test_quixstreams/test_state/test_manager.py
@@ -1,18 +1,15 @@
-import contextlib
import os
import uuid
-from unittest.mock import patch, call
+from unittest.mock import MagicMock
import pytest
-import rocksdict
+from quixstreams.kafka import Consumer
from quixstreams.state.exceptions import (
StoreNotRegisteredError,
- InvalidStoreTransactionStateError,
PartitionStoreIsUsed,
WindowedStoreAlreadyRegisteredError,
)
-from quixstreams.state.recovery import ChangelogProducerFactory
from tests.utils import TopicPartitionStub
@@ -43,11 +40,11 @@ def test_init_state_dir_exists_not_a_dir_fails(
...
def test_rebalance_partitions_stores_not_registered(self, state_manager):
- tp = TopicPartitionStub("topic", 0)
# It's ok to rebalance partitions when there are no stores registered
- state_manager.on_partition_assign(tp)
- state_manager.on_partition_revoke(tp)
- state_manager.on_partition_lost(tp)
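+        # committed_offset=-1001 is librdkafka's OFFSET_INVALID (no committed offset yet)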
+ state_manager.on_partition_assign(
+ topic="topic", partition=0, committed_offset=-1001
+ )
+ state_manager.on_partition_revoke(topic="topic", partition=0)
def test_register_store(self, state_manager):
state_manager = state_manager
@@ -69,7 +66,11 @@ def test_assign_revoke_partitions_stores_registered(self, state_manager):
store_partitions = []
for tp in partitions:
- store_partitions.extend(state_manager.on_partition_assign(tp))
+ store_partitions.extend(
+ state_manager.on_partition_assign(
+ topic=tp.topic, partition=tp.partition, committed_offset=-1001
+ )
+ )
assert len(store_partitions) == 3
assert len(state_manager.get_store("topic1", "store1").partitions) == 1
@@ -77,33 +78,7 @@ def test_assign_revoke_partitions_stores_registered(self, state_manager):
assert len(state_manager.get_store("topic2", "store1").partitions) == 1
for tp in partitions:
- state_manager.on_partition_revoke(tp)
-
- assert not state_manager.get_store("topic1", "store1").partitions
- assert not state_manager.get_store("topic1", "store2").partitions
- assert not state_manager.get_store("topic2", "store1").partitions
-
- def test_assign_lose_partitions_stores_registered(self, state_manager):
- state_manager.register_store("topic1", store_name="store1")
- state_manager.register_store("topic1", store_name="store2")
- state_manager.register_store("topic2", store_name="store1")
-
- stores_list = [s for d in state_manager.stores.values() for s in d.values()]
- assert len(stores_list) == 3
-
- partitions = [
- TopicPartitionStub("topic1", 0),
- TopicPartitionStub("topic2", 0),
- ]
-
- for tp in partitions:
- state_manager.on_partition_assign(tp)
- assert len(state_manager.get_store("topic1", "store1").partitions) == 1
- assert len(state_manager.get_store("topic1", "store2").partitions) == 1
- assert len(state_manager.get_store("topic2", "store1").partitions) == 1
-
- for tp in partitions:
- state_manager.on_partition_lost(tp)
+ state_manager.on_partition_revoke(topic=tp.topic, partition=tp.partition)
assert not state_manager.get_store("topic1", "store1").partitions
assert not state_manager.get_store("topic1", "store2").partitions
@@ -141,7 +116,9 @@ def test_clear_stores(self, state_manager):
# Assign partitions
for tp in partitions:
- state_manager.on_partition_assign(tp)
+ state_manager.on_partition_assign(
+ topic=tp.topic, partition=tp.partition, committed_offset=-1001
+ )
# Collect paths of stores to be deleted
stores_to_delete = [
@@ -153,7 +130,7 @@ def test_clear_stores(self, state_manager):
# Revoke partitions
for tp in partitions:
- state_manager.on_partition_revoke(tp)
+ state_manager.on_partition_revoke(topic=tp.topic, partition=tp.partition)
# Act - Delete stores
state_manager.clear_stores()
@@ -166,235 +143,76 @@ def test_clear_stores_fails(self, state_manager):
# Register stores
state_manager.register_store("topic1", store_name="store1")
- # Define the partition
- partition = TopicPartitionStub("topic1", 0)
-
# Assign the partition
- state_manager.on_partition_assign(partition)
+ state_manager.on_partition_assign(
+ topic="topic1", partition=0, committed_offset=-1001
+ )
# Act - Delete stores
with pytest.raises(PartitionStoreIsUsed):
state_manager.clear_stores()
- def test_store_transaction_success(self, state_manager):
- state_manager.register_store("topic", "store")
- tp = TopicPartitionStub("topic", 0)
- state_manager.on_partition_assign(tp)
-
- store = state_manager.get_store("topic", "store")
- store_partition = store.partitions[0]
-
- assert store_partition.get_processed_offset() is None
-
- with state_manager.start_store_transaction("topic", partition=0, offset=1):
- tx = state_manager.get_store_transaction("store")
- tx.set("some_key", "some_value")
-
- state_manager.on_partition_assign(tp)
-
- store = state_manager.get_store("topic", "store")
- store_partition = store.partitions[0]
-
- assert store_partition.get_processed_offset() == 1
-
- def test_store_transaction_no_flush_on_exception(self, state_manager):
- state_manager.register_store("topic", "store")
- state_manager.on_partition_assign(TopicPartitionStub("topic", 0))
- store = state_manager.get_store("topic", "store")
-
- with contextlib.suppress(Exception):
- with state_manager.start_store_transaction("topic", partition=0, offset=1):
- tx = state_manager.get_store_transaction("store")
- tx.set("some_key", "some_value")
- raise ValueError()
- store_partition = store.partitions[0]
- assert store_partition.get_processed_offset() is None
-
- def test_store_transaction_no_flush_if_partition_transaction_failed(
- self, state_manager
+class TestStateStoreManagerWithRecovery:
+ def test_rebalance_partitions_stores_not_registered(
+ self, state_manager_factory, recovery_manager_factory
):
- """
- Ensure that no PartitionTransactions are flushed to the DB if
- any of them fails
- """
- state_manager.register_store("topic", "store1")
- state_manager.register_store("topic", "store2")
- state_manager.on_partition_assign(TopicPartitionStub("topic", 0))
- store1 = state_manager.get_store("topic", "store1")
- store2 = state_manager.get_store("topic", "store2")
-
- with state_manager.start_store_transaction("topic", partition=0, offset=1):
- tx_store1 = state_manager.get_store_transaction("store1")
- tx_store2 = state_manager.get_store_transaction("store2")
- # Simulate exception in one of the transactions
- with contextlib.suppress(ValueError), patch.object(
- rocksdict.WriteBatch, "put", side_effect=ValueError("test")
- ):
- tx_store1.set("some_key", "some_value")
- tx_store2.set("some_key", "some_value")
-
- assert store1.partitions[0].get_processed_offset() is None
- assert store2.partitions[0].get_processed_offset() is None
-
- def test_get_store_transaction_store_not_registered_fails(self, state_manager):
- with pytest.raises(StoreNotRegisteredError):
- with state_manager.start_store_transaction("topic", 0, 0):
- ...
-
- def test_get_store_transaction_not_started(self, state_manager):
- with pytest.raises(InvalidStoreTransactionStateError):
- state_manager.get_store_transaction("store")
-
- def test_start_store_transaction_already_started(self, state_manager):
- state_manager.register_store("topic", "store")
- with state_manager.start_store_transaction("topic", partition=0, offset=0):
- with pytest.raises(InvalidStoreTransactionStateError):
- with state_manager.start_store_transaction(
- "topic", partition=0, offset=0
- ):
- ...
-
-
-class TestStateStoreManagerChangelog:
- def test_rebalance_partitions_stores_not_registered(self, state_manager_changelogs):
- state_manager = state_manager_changelogs
- tp = TopicPartitionStub("topic", 0)
+ state_manager = state_manager_factory(
+ recovery_manager=recovery_manager_factory()
+ )
# It's ok to rebalance partitions when there are no stores registered
- state_manager.on_partition_assign(tp)
- state_manager.on_partition_revoke(tp)
- state_manager.on_partition_lost(tp)
+ state_manager.on_partition_assign(
+ topic="topic", partition=0, committed_offset=-1001
+ )
+ state_manager.on_partition_revoke(topic="topic", partition=0)
+
+ def test_register_store(
+ self, state_manager_factory, recovery_manager_factory, topic_manager_factory
+ ):
+ topic_manager = topic_manager_factory()
+ recovery_manager = recovery_manager_factory(topic_manager=topic_manager)
+ state_manager = state_manager_factory(recovery_manager=recovery_manager)
- def test_register_store(self, state_manager_changelogs):
- state_manager = state_manager_changelogs
- topic_manager = state_manager._recovery_manager._topic_manager
+ # Create a topic
topic = topic_manager.topic(name="topic1")
+
+ # Register a store
store_name = "default"
state_manager.register_store(topic.name, store_name=store_name)
- assert store_name in state_manager._stores[topic.name]
+ # Check that the store is registered
+ assert store_name in state_manager.stores[topic.name]
+ # Check that changelog topic is created
assert store_name in topic_manager.changelog_topics[topic.name]
def test_assign_revoke_partitions_stores_registered(
- self,
- state_manager_changelogs,
+ self, state_manager_factory, recovery_manager_factory, topic_manager_factory
):
- state_manager = state_manager_changelogs
- recovery_manager = state_manager._recovery_manager
- topic_manager = recovery_manager._topic_manager
-
- changelog_assign = patch.object(recovery_manager, "assign_partition").start()
- changelog_revoke = patch.object(recovery_manager, "revoke_partition").start()
- topic_manager.topic(name="topic1")
- topic_manager.topic(name="topic2")
- state_manager.register_store("topic1", store_name="store1")
- state_manager.register_store("topic1", store_name="store2")
- state_manager.register_store("topic2", store_name="store1")
-
- stores_list = [s for d in state_manager.stores.values() for s in d.values()]
- assert len(stores_list) == 3
-
- partitions = [
- TopicPartitionStub("topic1", 0),
- TopicPartitionStub("topic2", 0),
- ]
-
- store_partitions = []
- assign_calls = []
- for tp in partitions:
- store_partitions.extend(state_manager.on_partition_assign(tp))
- assign_calls.append(
- call(
- tp.topic,
- tp.partition,
- {
- name: store.partitions[tp.partition]
- for name, store in state_manager._stores[tp.topic].items()
- },
- )
- )
- assert changelog_assign.call_count == len(assign_calls)
- assert len(store_partitions) == 3
-
- for store in stores_list:
- assert len(store.partitions) == 1
- assert isinstance(
- store._changelog_producer_factory, ChangelogProducerFactory
- )
-
- revoke_calls = []
- for tp in partitions:
- state_manager.on_partition_revoke(tp)
- revoke_calls.append(call(tp.partition))
- changelog_revoke.assert_has_calls(revoke_calls)
- assert changelog_revoke.call_count == len(revoke_calls)
-
- for store in stores_list:
- assert not store.partitions
-
- def test_store_transaction_no_flush_on_exception(
- self,
- state_manager_changelogs,
- ):
- state_manager = state_manager_changelogs
- recovery_manager = state_manager._recovery_manager
- topic_manager = recovery_manager._topic_manager
- producer = state_manager._producer
- consumer = recovery_manager._consumer
-
- consumer.get_watermark_offsets.return_value = (0, 10)
- topic_manager.topic(name="topic")
- # topic_admin_mock.inspect_topics.return_value = {"topic": None}
- state_manager.register_store("topic", store_name="store")
- state_manager.on_partition_assign(TopicPartitionStub("topic", 0))
- store = state_manager.get_store("topic", "store")
-
- with contextlib.suppress(Exception):
- with state_manager.start_store_transaction("topic", partition=0, offset=1):
- tx = state_manager.get_store_transaction("store")
- tx.set("some_key", "some_value")
- raise ValueError()
-
- store_partition = store.partitions[0]
- assert store_partition.get_processed_offset() is None
- assert store_partition.get_changelog_offset() is None
- producer.produce.assert_not_called()
-
- def test_store_transaction_no_flush_if_partition_transaction_failed(
- self,
- state_manager_changelogs,
- ):
- """
- Ensure that no PartitionTransactions are flushed to the DB if
- any of them fails
- """
- state_manager = state_manager_changelogs
- recovery_manager = state_manager._recovery_manager
- topic_manager = recovery_manager._topic_manager
- producer = state_manager._producer
- consumer = recovery_manager._consumer
-
+ topic_manager = topic_manager_factory()
+ consumer = MagicMock(spec_set=Consumer)
consumer.get_watermark_offsets.return_value = (0, 10)
- topic_manager.topic(name="topic")
- state_manager.register_store("topic", store_name="store1")
- state_manager.register_store("topic", store_name="store2")
- state_manager.on_partition_assign(TopicPartitionStub("topic", 0))
-
- store1 = state_manager.get_store("topic", "store1")
- store2 = state_manager.get_store("topic", "store2")
-
- with state_manager.start_store_transaction("topic", partition=0, offset=1):
- tx_store1 = state_manager.get_store_transaction("store1")
- tx_store2 = state_manager.get_store_transaction("store2")
- # Simulate exception in one of the transactions
- with contextlib.suppress(ValueError), patch.object(
- rocksdict.WriteBatch, "put", side_effect=ValueError("test")
- ):
- tx_store1.set("some_key", "some_value")
- tx_store2.set("some_key", "some_value")
-
- assert store1.partitions[0].get_processed_offset() is None
- assert store1.partitions[0].get_changelog_offset() is None
- assert store2.partitions[0].get_processed_offset() is None
- assert store2.partitions[0].get_changelog_offset() is None
- producer.produce.assert_not_called()
+ recovery_manager = recovery_manager_factory(
+ topic_manager=topic_manager, consumer=consumer
+ )
+ state_manager = state_manager_factory(recovery_manager=recovery_manager)
+ topic_name = "topic1"
+ partition = 0
+ topic_manager.topic(name=topic_name)
+ store_name = "store1"
+
+ # Register a store
+ state_manager.register_store(topic_name, store_name=store_name)
+
+ # Assign a topic partition
+ state_manager.on_partition_assign(
+ topic=topic_name, partition=partition, committed_offset=-1001
+ )
+
+ # Check that RecoveryManager has a partition assigned
+ assert recovery_manager.partitions
+
+ # Revoke a topic partition
+ state_manager.on_partition_revoke(topic=topic_name, partition=partition)
+
+ # Check that RecoveryManager has a partition revoked too
+ assert not recovery_manager.partitions
diff --git a/tests/test_quixstreams/test_state/test_recovery.py b/tests/test_quixstreams/test_state/test_recovery.py
deleted file mode 100644
index b066ad1d6..000000000
--- a/tests/test_quixstreams/test_state/test_recovery.py
+++ /dev/null
@@ -1,546 +0,0 @@
-import logging
-import uuid
-from unittest.mock import patch
-
-from quixstreams.state.recovery import (
- ChangelogProducer,
- ConfluentPartition,
-)
-from quixstreams.state.recovery import ChangelogProducerFactory
-from ..utils import ConfluentKafkaMessageStub
-
-
-class TestRecoveryPartition:
- def test_set_watermarks(self, recovery_partition_store_mock):
- recovery_partition = recovery_partition_store_mock
- recovery_partition.set_watermarks(50, 100)
- assert recovery_partition._changelog_lowwater == 50
- assert recovery_partition._changelog_highwater == 100
-
- def test_needs_recovery(self, recovery_partition_store_mock):
- recovery_partition = recovery_partition_store_mock
- assert recovery_partition.needs_recovery
-
- def test_needs_recovery_caught_up(self, recovery_partition_store_mock):
- recovery_partition = recovery_partition_store_mock
- recovery_partition.store_partition.get_changelog_offset.return_value = 20
- assert not recovery_partition_store_mock.needs_recovery
-
- def test_needs_recovery_no_valid_offsets(self, recovery_partition_store_mock):
- recovery_partition = recovery_partition_store_mock
- recovery_partition.set_watermarks(100, 100)
- assert not recovery_partition.needs_recovery
- assert recovery_partition.needs_offset_update
-
- def test_recover(self, recovery_partition_store_mock):
- recovery_partition = recovery_partition_store_mock
- msg = ConfluentKafkaMessageStub()
- recovery_partition.recover_from_changelog_message(msg)
- recovery_partition.store_partition.recover_from_changelog_message.assert_called_with(
- changelog_message=msg
- )
-
- def test_update_offset(self, recovery_partition_store_mock):
- recovery_partition = recovery_partition_store_mock
- recovery_partition.update_offset()
- recovery_partition.store_partition.set_changelog_offset.assert_called_with(
- recovery_partition._changelog_highwater - 1
- )
-
- def test_update_offset_warn(self, recovery_partition_store_mock, caplog):
- """
- A warning is thrown if the stored changelog offset is higher than the highwater
- """
- recovery_partition = recovery_partition_store_mock
- recovery_partition.store_partition.get_changelog_offset.return_value = (
- recovery_partition._changelog_highwater + 1
- )
- with caplog.at_level(level=logging.WARNING):
- recovery_partition.update_offset()
- assert caplog.text != ""
-
-
-class TestChangelogProducer:
- def test_produce(
- self, topic_manager_factory, row_producer_factory, consumer_factory
- ):
- p_num = 2
- cf_header = "my_cf_header"
- cf = "my_cf"
- expected = {
- "key": b"my_key",
- "value": b"10",
- "headers": [(cf_header, cf.encode())],
- "partition": p_num,
- }
- topic_manager = topic_manager_factory()
- changelog = topic_manager.topic(
- name=str(uuid.uuid4()),
- key_serializer="bytes",
- value_serializer="bytes",
- config=topic_manager.topic_config(num_partitions=3),
- )
- topic_manager.create_topics([changelog])
-
- writer = ChangelogProducer(
- changelog_name=changelog.name,
- partition_num=p_num,
- producer=row_producer_factory(),
- )
- writer.produce(
- **{k: v for k, v in expected.items() if k in ["key", "value"]},
- headers={cf_header: cf},
- )
- writer._producer.flush(5)
-
- consumer = consumer_factory(auto_offset_reset="earliest")
- consumer.subscribe([changelog.name])
- message = consumer.poll(10)
-
- for k in expected:
- assert getattr(message, k)() == expected[k]
-
-
-class TestChangelogProducerFactory:
- def test_get_partition_producer(self, row_producer_factory):
- changelog_name = "changelog__topic"
- producer = row_producer_factory()
-
- p_num = 1
-
- changelog_producer = ChangelogProducerFactory(
- changelog_name=changelog_name, producer=producer
- ).get_partition_producer(partition_num=p_num)
- assert changelog_producer._changelog_name == changelog_name
- assert changelog_producer._partition_num == p_num
- assert changelog_producer._producer == producer
-
-
-class TestRecoveryManager:
- def test_register_changelog(self, recovery_manager_mock_consumer):
- recovery_manager = recovery_manager_mock_consumer
- topic_manager = recovery_manager._topic_manager
- store_name = "my_store"
- kwargs = dict(
- topic_name="my_topic_name",
- consumer_group="my_group",
- )
- with patch.object(topic_manager, "changelog_topic") as make_changelog:
- recovery_manager.register_changelog(**kwargs, store_name=store_name)
- make_changelog.assert_called_with(**kwargs, store_name=store_name)
-
- def test_assign_partition(self, state_manager_changelogs):
- """
- From two `Store`s `StorePartition`s (partition 1), assign the partition
- ("window") that needs recovery.
-
- No recovery underway yet, so should pause all partitions.
- """
- state_manager = state_manager_changelogs
- recovery_manager = state_manager._recovery_manager
- topic_manager = recovery_manager._topic_manager
- consumer = recovery_manager._consumer
- expected_store_name = "window"
- expected_offset = 15
-
- topic_name = "topic_name"
- topic_manager.topic(topic_name)
- partition_num = 1
- consumer.get_watermark_offsets.side_effect = [(0, 10), (0, 20)]
- consumer.assignment.return_value = "assignments"
-
- # setup state_managers `StorePartitions` (which also sets up changelog topics)
- store_partitions = {}
- for store_name, offset in [
- ("default", 10),
- (expected_store_name, expected_offset),
- ]:
- state_manager.register_store(topic_name=topic_name, store_name=store_name)
- partition = state_manager.get_store(
- topic=topic_name, store_name=store_name
- ).assign_partition(partition_num)
- patch.object(partition, "get_changelog_offset", return_value=offset).start()
- store_partitions[store_name] = partition
-
- recovery_manager.assign_partition(
- topic_name=topic_name,
- partition_num=partition_num,
- store_partitions=store_partitions,
- )
-
- # expected changelog topic's partition was subscribed to
- expected_changelog_name = topic_manager.changelog_topics[topic_name][
- expected_store_name
- ].name
- assign_calls = consumer.incremental_assign.call_args_list[0].args
- assert len(assign_calls) == 1
- partition_list = assign_calls[0]
- assert isinstance(partition_list, list)
- assert len(assign_calls) == 1
- confluent_partition = partition_list[0]
- assert isinstance(confluent_partition, ConfluentPartition)
- assert expected_changelog_name == confluent_partition.topic
- assert partition_num == confluent_partition.partition
- assert expected_offset == confluent_partition.offset
-
- # recovery manager should also store respective RecoveryPartition
- assert recovery_manager._recovery_partitions[partition_num][
- expected_changelog_name
- ]
- assert len(recovery_manager._recovery_partitions[partition_num]) == 1
-
- # should pause ALL partitions
- consumer.pause.assert_called_with("assignments")
-
- def test_assign_partition_fix_offset_only(
- self, recovery_manager_mock_consumer, recovery_partition_factory
- ):
- """
- From two RecoveryPartitions, fix the one ("window") that has a bad offset.
-
- No recovery was previously going, and an offset fix will not trigger one.
- """
- recovery_manager = recovery_manager_mock_consumer
- topic_name = "topic_name"
- partition_num = 1
- store_names = ["default", "window"]
- changelog_names = [f"changelog__{topic_name}__{store}" for store in store_names]
- watermarks = [(0, 10), (0, 20)]
- changelog_offsets = [10, 22]
-
- consumer = recovery_manager._consumer
- consumer.assignment.return_value = "assignments"
-
- recovery_partitions = [
- recovery_partition_factory(
- changelog_name=changelog_names[i],
- partition_num=partition_num,
- mocked_changelog_offset=changelog_offsets[i],
- lowwater=watermarks[i][0],
- highwater=watermarks[i][1],
- )
- for i in range(len(store_names))
- ]
- patch.object(
- recovery_manager,
- "_generate_recovery_partitions",
- return_value=recovery_partitions,
- ).start()
- with patch.object(recovery_partitions[1], "update_offset") as update_offset:
- recovery_manager.assign_partition(
- topic_name=topic_name,
- partition_num=partition_num,
- store_partitions="mocked_out",
- )
-
- # no pause or assignments should be called
- consumer.pause.assert_not_called()
- consumer.incremental_assign.assert_not_called()
- update_offset.assert_called()
-
- def test_assign_partition_fix_offset_during_recovery(
- self, recovery_manager_mock_consumer, recovery_partition_factory
- ):
- """
- From two RecoveryPartitions, fix the one ("window") that has a bad offset.
-
- Recovery was previously going, so must pause the source topic.
- """
- recovery_manager = recovery_manager_mock_consumer
- recovery_manager._running = True
- topic_name = "topic_name"
- partition_num = 1
- store_names = ["default", "window"]
- changelog_names = [f"changelog__{topic_name}__{store}" for store in store_names]
- watermarks = [(0, 10), (0, 20)]
- changelog_offsets = [10, 22]
-
- consumer = recovery_manager._consumer
-
- # already in the middle of recovering
- recovery_manager._recovery_partitions.setdefault(2, {})[
- changelog_offsets[0]
- ] = recovery_partition_factory(
- changelog_name=changelog_names[0],
- partition_num=2,
- )
- assert recovery_manager.recovering
-
- recovery_partitions = [
- recovery_partition_factory(
- changelog_name=changelog_names[i],
- partition_num=partition_num,
- mocked_changelog_offset=changelog_offsets[i],
- lowwater=watermarks[i][0],
- highwater=watermarks[i][1],
- )
- for i in range(len(store_names))
- ]
-
- patch.object(
- recovery_manager,
- "_generate_recovery_partitions",
- return_value=recovery_partitions,
- ).start()
-
- with patch.object(recovery_partitions[1], "update_offset") as update_offset:
- recovery_manager.assign_partition(
- topic_name=topic_name,
- partition_num=partition_num,
- store_partitions="mocked",
- )
-
- pause_call = consumer.pause.call_args_list[0].args
- assert len(pause_call) == 1
- assert isinstance(pause_call[0], list)
- assert len(pause_call[0]) == 1
- assert isinstance(pause_call[0][0], ConfluentPartition)
- assert topic_name == pause_call[0][0].topic
- assert partition_num == pause_call[0][0].partition
-
- consumer.incremental_assign.assert_not_called()
- update_offset.assert_called()
-
- def test_assign_partitions_during_recovery(
- self, recovery_manager_mock_consumer, recovery_partition_factory
- ):
- """
- From two RecoveryPartitions, assign the one ("window") that needs recovery.
-
- RecoveryManager is currently recovering, so should only pause source topic.
- """
- recovery_manager = recovery_manager_mock_consumer
- recovery_manager._running = True
- topic_name = "topic_name"
- partition_num = 1
- store_names = ["default", "window"]
- changelog_names = [f"changelog__{topic_name}__{store}" for store in store_names]
- watermarks = [(0, 10), (0, 20)]
- changelog_offsets = [10, 15]
-
- consumer = recovery_manager._consumer
- consumer.get_watermark_offsets.side_effect = watermarks
-
- # already in the middle of recovering
- recovery_manager._recovery_partitions.setdefault(2, {})[
- changelog_offsets[0]
- ] = recovery_partition_factory(
- changelog_name=changelog_names[0],
- partition_num=2,
- )
- assert recovery_manager.recovering
-
- recovery_partitions = [
- recovery_partition_factory(
- changelog_name=changelog_names[i],
- partition_num=partition_num,
- mocked_changelog_offset=changelog_offsets[i],
- lowwater=watermarks[i][0],
- highwater=watermarks[i][1],
- )
- for i in range(len(store_names))
- ]
- skip_recover_partition = recovery_partitions[0]
- should_recover_partition = recovery_partitions[1]
-
- patch.object(
- recovery_manager,
- "_generate_recovery_partitions",
- return_value=recovery_partitions,
- ).start()
- recovery_manager.assign_partition(
- topic_name=topic_name,
- partition_num=partition_num,
- store_partitions="mocked_out",
- )
-
- # should only pause the source topic partition since currently recovering
- pause_call = consumer.pause.call_args_list[0].args
- assert len(pause_call) == 1
- assert isinstance(pause_call[0], list)
- assert len(pause_call[0]) == 1
- assert isinstance(pause_call[0][0], ConfluentPartition)
- assert topic_name == pause_call[0][0].topic
- assert partition_num == pause_call[0][0].partition
-
- # should only assign the partition that needs recovery
- assign_call = consumer.incremental_assign.call_args_list[0].args
- assert len(assign_call) == 1
- assert isinstance(assign_call[0], list)
- assert len(assign_call[0]) == 1
- assert isinstance(assign_call[0][0], ConfluentPartition)
- assert should_recover_partition.changelog_name == assign_call[0][0].topic
- assert should_recover_partition.partition_num == assign_call[0][0].partition
- assert should_recover_partition.offset == assign_call[0][0].offset
- assert (
- recovery_manager._recovery_partitions[partition_num][
- should_recover_partition.changelog_name
- ]
- == should_recover_partition
- )
- assert (
- skip_recover_partition.changelog_name
- not in recovery_manager._recovery_partitions[partition_num]
- )
-
- def test__revoke_recovery_partition(
- self, recovery_manager_mock_consumer, recovery_partition_factory
- ):
- recovery_manager = recovery_manager_mock_consumer
- consumer = recovery_manager._consumer
- topic_name = "topic_name"
- partition_num = 1
- changelog_names = [
- f"changelog__{topic_name}__{store_name}"
- for store_name in ["default", "window"]
- ]
-
- recovery_manager._recovery_partitions = {
- partition_num: {
- changelog_name: recovery_partition_factory(
- changelog_name=changelog_name,
- partition_num=partition_num,
- )
- for changelog_name in changelog_names
- }
- }
-
- recovery_manager.revoke_partition(partition_num=partition_num)
-
- unassign_call = consumer.incremental_unassign.call_args_list[0].args
- assert len(unassign_call) == 1
- assert isinstance(unassign_call[0], list)
- assert len(unassign_call[0]) == 2
- for idx, confluent_partition in enumerate(unassign_call[0]):
- assert isinstance(confluent_partition, ConfluentPartition)
- assert changelog_names[idx] == confluent_partition.topic
- assert partition_num == confluent_partition.partition
- assert not recovery_manager._recovery_partitions
-
- def test_revoke_partitions(
- self, recovery_manager_mock_consumer, recovery_partition_factory
- ):
- """
- Revoke a topic partition's respective recovery partitions.
- """
- recovery_manager = recovery_manager_mock_consumer
- topic_name = "topic_name"
- partition_num = 1
- changelog_name = f"changelog__{topic_name}__default"
- recovery_partition = (
- recovery_partition_factory(
- changelog_name=changelog_name,
- partition_num=partition_num,
- ),
- )
- recovery_manager._recovery_partitions = {
- partition_num: {changelog_name: recovery_partition}
- }
-
- with patch.object(recovery_manager, "_revoke_recovery_partitions") as revoke:
- recovery_manager.revoke_partition(partition_num=partition_num)
-
- revoke.assert_called_with([recovery_partition], partition_num)
-
- def test_revoke_partition_not_assigned(self, recovery_manager_mock_consumer):
- """
- Skip revoking any recovery partitions for a given partition since none are
- currently assigned (due to not needing recovery).
- """
- recovery_manager = recovery_manager_mock_consumer
- with patch.object(recovery_manager, "_revoke_recovery_partitions") as revoke:
- recovery_manager.revoke_partition(partition_num=1)
-
- revoke.assert_not_called()
-
- def test_do_recovery(
- self, recovery_manager_mock_consumer, recovery_partition_factory
- ):
- recovery_manager = recovery_manager_mock_consumer
- topic_name = "topic_name"
- partition_num = 1
- store_names = ["default", "window"]
- changelog_names = [f"changelog__{topic_name}__{store}" for store in store_names]
- watermarks = [(0, 10), (0, 20)]
- changelog_offsets = [0, 0]
-
- consumer = recovery_manager._consumer
- consumer.assignment.return_value = ["assignments"]
-
- recovery_partitions = [
- recovery_partition_factory(
- changelog_name=changelog_names[i],
- partition_num=partition_num,
- mocked_changelog_offset=changelog_offsets[i],
- lowwater=watermarks[i][0],
- highwater=watermarks[i][1],
- )
- for i in range(len(store_names))
- ]
-
- patch.object(
- recovery_manager,
- "_generate_recovery_partitions",
- return_value=recovery_partitions,
- ).start()
-
- recovery_manager.assign_partition(
- topic_name=topic_name,
- partition_num=partition_num,
- store_partitions="mocked_out",
- )
- with patch.object(recovery_manager, "_recovery_loop") as recovery_loop:
- recovery_manager.do_recovery()
-
- changelog_resume_args = consumer.resume.call_args_list[0].args[0]
- print(changelog_resume_args)
- assert len(changelog_resume_args) == 2
- for idx, tp in enumerate(changelog_resume_args):
- assert recovery_partitions[idx].changelog_name == tp.topic
- assert recovery_partitions[idx].partition_num == tp.partition
- recovery_loop.assert_called()
- assert consumer.resume.call_args_list[1].args[0] == ["assignments"]
- assert consumer.resume.call_count == 2
-
- def test__recovery_loop(
- self, recovery_manager_mock_consumer, recovery_partition_factory
- ):
- """
- Successfully recover from a changelog message, which is also the last one
- for the partition, so revoke it afterward.
- """
- recovery_manager = recovery_manager_mock_consumer
- recovery_manager._running = True
- consumer = recovery_manager._consumer
- topic_name = "topic_name"
- changelog_name = f"changelog__{topic_name}__default"
- highwater = 20
- partition_num = 1
- msg = ConfluentKafkaMessageStub(
- topic=changelog_name, partition=partition_num, offset=highwater - 1
- )
- consumer.poll.return_value = msg
- rp = recovery_partition_factory(
- changelog_name=changelog_name,
- partition_num=partition_num,
- mocked_changelog_offset=highwater, # referenced AFTER recovering from msg
- lowwater=0,
- highwater=highwater,
- )
- recovery_manager._recovery_partitions.setdefault(partition_num, {})[
- changelog_name
- ] = rp
-
- recovery_manager._recovery_loop()
-
- rp.store_partition.recover_from_changelog_message.assert_called_with(
- changelog_message=msg
- )
- consumer.incremental_unassign.assert_called()
-
- def test__recovery_loop_no_partitions(self, recovery_manager_mock_consumer):
- recovery_manager = recovery_manager_mock_consumer
- consumer = recovery_manager._consumer
-
- recovery_manager._recovery_loop()
- consumer.poll.assert_not_called()
diff --git a/tests/test_quixstreams/test_state/test_recovery/__init__.py b/tests/test_quixstreams/test_state/test_recovery/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py b/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py
new file mode 100644
index 000000000..d4c58f3ef
--- /dev/null
+++ b/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py
@@ -0,0 +1,60 @@
+import uuid
+
+from quixstreams.state import ChangelogProducer, ChangelogProducerFactory
+
+
+class TestChangelogProducer:
+ def test_produce(
+ self, topic_manager_factory, row_producer_factory, consumer_factory
+ ):
+ p_num = 2
+ cf_header = "my_cf_header"
+ cf = "my_cf"
+ expected = {
+ "key": b"my_key",
+ "value": b"10",
+ "headers": [(cf_header, cf.encode())],
+ "partition": p_num,
+ }
+ topic_manager = topic_manager_factory()
+ changelog = topic_manager.topic(
+ name=str(uuid.uuid4()),
+ key_serializer="bytes",
+ value_serializer="bytes",
+ config=topic_manager.topic_config(num_partitions=3),
+ )
+ source_topic_name = "source-topic"
+ topic_manager.create_topics([changelog])
+
+ producer = ChangelogProducer(
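+        # Produce a changelog record and read it back to verify the key, value,
+        # headers, and partition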
+ changelog_name=changelog.name,
+ partition=p_num,
+ producer=row_producer_factory(),
+ )
+ producer.produce(
+ **{k: v for k, v in expected.items() if k in ["key", "value"]},
+ headers={cf_header: cf},
+ )
+ producer.flush()
+
+ consumer = consumer_factory(auto_offset_reset="earliest")
+ consumer.subscribe([changelog.name])
+ message = consumer.poll(10)
+
+ for k in expected:
+ assert getattr(message, k)() == expected[k]
+
+
+class TestChangelogProducerFactory:
+ def test_get_partition_producer(self, row_producer_factory):
+ changelog_name = "changelog__topic"
+ producer = row_producer_factory()
+
+ p_num = 1
+
+ changelog_producer = ChangelogProducerFactory(
+ changelog_name=changelog_name,
+ producer=producer,
+ ).get_partition_producer(partition_num=p_num)
+ assert changelog_producer.changelog_name == changelog_name
+ assert changelog_producer.partition == p_num
diff --git a/tests/test_quixstreams/test_state/test_recovery/test_recovery_manager.py b/tests/test_quixstreams/test_state/test_recovery/test_recovery_manager.py
new file mode 100644
index 000000000..e27895fa6
--- /dev/null
+++ b/tests/test_quixstreams/test_state/test_recovery/test_recovery_manager.py
@@ -0,0 +1,375 @@
+from unittest.mock import patch, MagicMock
+
+from confluent_kafka import TopicPartition as ConfluentPartition
+
+from quixstreams.kafka import Consumer
+from quixstreams.models import TopicManager, TopicConfig
+from quixstreams.state import RecoveryPartition
+from quixstreams.state.rocksdb import RocksDBStorePartition
+from quixstreams.state.rocksdb.metadata import CHANGELOG_CF_MESSAGE_HEADER
+from tests.utils import ConfluentKafkaMessageStub
+
+
+class TestRecoveryManager:
+ def test_register_changelog(self, recovery_manager_factory):
+ recovery_manager = recovery_manager_factory()
+
+ store_name = "my_store"
+ kwargs = dict(
+ topic_name="my_topic_name",
+ consumer_group="my_group",
+ )
+ with patch.object(TopicManager, "changelog_topic") as make_changelog:
+ recovery_manager.register_changelog(**kwargs, store_name=store_name)
+
+ make_changelog.assert_called_with(**kwargs, store_name=store_name)
+
+ def test_assign_partition(
+ self, state_manager_factory, recovery_manager_factory, topic_manager_factory
+ ):
+ """
+        Check that RecoveryManager.assign_partition() assigns the correct changelog
+        topic partition and pauses the consumer.
+ """
+
+ store_name = "default"
+ # Stored changelog offset is between lowwater and highwater, so the
+ # given store partition needs to be recovered.
+ lowwater, highwater = 0, 20
+ stored_changelog_offset = 15
+
+ topic_name = "topic_name"
+ partition_num = 0
+
+ consumer = MagicMock(spec_set=Consumer)
+ topic_manager = topic_manager_factory()
+ recovery_manager = recovery_manager_factory(
+ consumer=consumer, topic_manager=topic_manager
+ )
+ state_manager = state_manager_factory(recovery_manager=recovery_manager)
+
+ # Create a topic
+ topic_manager.topic(topic_name)
+ # Mock the topic watermarks
+ consumer.get_watermark_offsets.side_effect = [(lowwater, highwater)]
+ # Mock the current assignment with some values
+ assignment = [1, 2, 3]
+ consumer.assignment.return_value = assignment
+
+ # Create Store and assign a StorePartition (which also sets up changelog topics)
+ store_partitions = {}
+ state_manager.register_store(topic_name=topic_name, store_name=store_name)
+ store = state_manager.get_store(topic=topic_name, store_name=store_name)
+ partition = store.assign_partition(partition_num)
+ store_partitions[store_name] = partition
+
+ # Assign a RecoveryPartition
+ with patch.object(
+ RocksDBStorePartition,
+ "get_changelog_offset",
+ return_value=stored_changelog_offset,
+ ):
+ recovery_manager.assign_partition(
+ topic=topic_name,
+ partition=partition_num,
+ store_partitions=store_partitions,
+ committed_offset=-1001,
+ )
+
+ # Check the changelog topic partition is assigned to the consumer
+ assert consumer.incremental_assign.call_count == 1
+ assigned_changelog_partitions = consumer.incremental_assign.call_args[0][0]
+ assert len(assigned_changelog_partitions) == 1
+
+ # Check the changelog topic partition properties
+ changelog_partition = assigned_changelog_partitions[0]
+ changelog_topic_name = topic_manager.changelog_topics[topic_name][
+ store_name
+ ].name
+ assert changelog_partition.topic == changelog_topic_name
+ assert changelog_partition.partition == partition_num
+ assert changelog_partition.offset == stored_changelog_offset
+
+ # Check that RecoveryPartition is assigned to RecoveryManager
+ assert len(recovery_manager.partitions[partition_num]) == 1
+
+ # Check that consumer paused all assigned partitions
+ consumer.pause.assert_called_with(assignment)
+
+ def test_assign_partition_fix_offset_only(
+ self,
+ recovery_manager_factory,
+ recovery_partition_factory,
+ topic_manager_factory,
+ ):
+ """
+        Try to recover a store partition with a changelog offset AHEAD of the watermark.
+        The offset should be adjusted in this case, but recovery should not be triggered.
+ """
+
+ topic_name = "topic_name"
+ partition_num = 0
+ store_name = "default"
+ consumer_group = "group"
+ lowwater, highwater = 0, 20
+
+ # Register a source topic and a changelog topic with one partition
+ topic_manager = topic_manager_factory()
+ topic_manager.topic(topic_name)
+ topic_manager.changelog_topic(
+ topic_name=topic_name, store_name=store_name, consumer_group=consumer_group
+ )
+
+ # Mock Consumer
+ consumer = MagicMock(spec_set=Consumer)
+ consumer.get_watermark_offsets.return_value = (lowwater, highwater)
+ consumer.assignment.return_value = "assignments"
+
+ # Mock StorePartition
+ changelog_offset = 22
+ store_partition = MagicMock(spec_set=RocksDBStorePartition)
+ store_partition.get_changelog_offset.return_value = changelog_offset
+
+ recovery_manager = recovery_manager_factory(
+ consumer=consumer, topic_manager=topic_manager
+ )
+
+ with patch.object(RecoveryPartition, "update_offset") as update_offset:
+ recovery_manager.assign_partition(
+ topic=topic_name,
+ partition=partition_num,
+ store_partitions={store_name: store_partition},
+ committed_offset=-1001,
+ )
+
+ # "update_offset()" should be called
+ update_offset.assert_called()
+
+ # No pause or assignments should happen
+ consumer.pause.assert_not_called()
+ consumer.incremental_assign.assert_not_called()
+
+ def test_assign_partitions_during_recovery(
+ self,
+ recovery_manager_factory,
+ recovery_partition_factory,
+ topic_manager_factory,
+ ):
+ """
+ Check that RecoveryManager pauses only the source topic partition if
+ another partition is already recovering.
+ """
+
+ topic_name = "topic_name"
+ consumer_group = "group"
+ store_name = "default"
+ changelog_name = f"changelog__{consumer_group}--{topic_name}--{store_name}"
+ changelog_offset = 5
+ lowwater, highwater = 0, 10
+ assignment = [0, 1]
+
+ # Register a source topic and a changelog topic with 2 partitions
+ topic_manager = topic_manager_factory()
+ topic_manager.topic(
+ topic_name, config=TopicConfig(num_partitions=2, replication_factor=1)
+ )
+ topic_manager.changelog_topic(
+ topic_name=topic_name, store_name=store_name, consumer_group=consumer_group
+ )
+
+ # Create a RecoveryManager
+ consumer = MagicMock(spec_set=Consumer)
+ consumer.assignment.return_value = assignment
+ recovery_manager = recovery_manager_factory(
+ consumer=consumer, topic_manager=topic_manager
+ )
+
+ # Assign first partition that needs recovery
+ store_partition = MagicMock(spec_set=RocksDBStorePartition)
+ consumer.get_watermark_offsets.return_value = (lowwater, highwater)
+ store_partition.get_changelog_offset.return_value = changelog_offset
+ recovery_manager.assign_partition(
+ topic=topic_name,
+ partition=0,
+ committed_offset=-1001,
+ store_partitions={store_name: store_partition},
+ )
+ assert recovery_manager.partitions
+ assert recovery_manager.partitions[0][changelog_name].needs_recovery
+
+ # Put a RecoveryManager into "recovering" state
+ recovery_manager._running = True
+ assert recovery_manager.recovering
+
+ # Assign second partition that also needs recovery
+ store_partition = MagicMock(spec_set=RocksDBStorePartition)
+ store_partition.get_changelog_offset.return_value = 5
+ recovery_manager.assign_partition(
+ topic=topic_name,
+ partition=1,
+ committed_offset=-1001,
+ store_partitions={store_name: store_partition},
+ )
+ assert recovery_manager.partitions
+ assert recovery_manager.partitions[1][changelog_name].needs_recovery
+
+ # Check that consumer first paused all partitions
+ assert consumer.pause.call_args_list[0].args[0] == assignment
+
+        # Check that the consumer paused only the source topic partition when the
+        # second recovery partition was assigned
+ assert consumer.pause.call_args_list[1].args[0] == [
+ ConfluentPartition(
+ topic=topic_name,
+ partition=1,
+ offset=-1001,
+ )
+ ]
+
+ def test_revoke_partition(self, recovery_manager_factory, topic_manager_factory):
+ """
+ Revoke a topic partition's respective recovery partitions.
+ """
+ topic_name = "topic_name"
+ consumer_group = "group"
+ store_name = "default"
+ changelog_offset = 5
+ lowwater, highwater = 0, 10
+ assignment = [0, 1]
+ changelog_name = f"changelog__{consumer_group}--{topic_name}--{store_name}"
+
+ # Register a source topic and a changelog topic with two partitions
+ topic_manager = topic_manager_factory()
+ topic_manager.topic(
+ topic_name, config=TopicConfig(num_partitions=2, replication_factor=1)
+ )
+ topic_manager.changelog_topic(
+ topic_name=topic_name, store_name=store_name, consumer_group=consumer_group
+ )
+
+ # Create a RecoveryManager
+ consumer = MagicMock(spec_set=Consumer)
+ consumer.assignment.return_value = assignment
+ recovery_manager = recovery_manager_factory(
+ consumer=consumer, topic_manager=topic_manager
+ )
+
+ # Assign partitions that need recovery
+ store_partition = MagicMock(spec_set=RocksDBStorePartition)
+ consumer.get_watermark_offsets.return_value = (lowwater, highwater)
+ store_partition.get_changelog_offset.return_value = changelog_offset
+ recovery_manager.assign_partition(
+ topic=topic_name,
+ partition=0,
+ committed_offset=-1001,
+ store_partitions={store_name: store_partition},
+ )
+ recovery_manager.assign_partition(
+ topic=topic_name,
+ partition=1,
+ committed_offset=-1001,
+ store_partitions={store_name: store_partition},
+ )
+ assert len(recovery_manager.partitions) == 2
+
+ # Revoke one partition
+ recovery_manager.revoke_partition(0)
+ assert len(recovery_manager.partitions) == 1
+ # Check that consumer unassigned the changelog topic partition as well
+ assert consumer.incremental_unassign.call_args.args[0] == [
+ ConfluentPartition(topic=changelog_name, partition=0)
+ ]
+
+ # Revoke second partition
+ recovery_manager.revoke_partition(1)
+ # Check that consumer unassigned the changelog topic partition as well
+ assert consumer.incremental_unassign.call_args.args[0] == [
+ ConfluentPartition(topic=changelog_name, partition=1)
+ ]
+ # Check that no partitions are assigned
+ assert not recovery_manager.partitions
+
+ def test_revoke_partition_no_partitions_assigned(self, recovery_manager_factory):
+ """
+ Skip revoking any recovery partitions for a given partition since none are
+ currently assigned (due to not needing recovery).
+ """
+ consumer = MagicMock(spec_set=Consumer)
+ recovery_manager = recovery_manager_factory(consumer=consumer)
+ recovery_manager.revoke_partition(partition_num=0)
+ assert not consumer.incremental_unassign.call_count
+
+ def test_do_recovery(
+ self, recovery_manager_factory, topic_manager_factory, rocksdb_partition
+ ):
+ """
+ Test that RecoveryManager.do_recovery():
+ - resumes the recovering changelog partition
+ - applies changes to the StorePartition
+ - revokes the RecoveryPartition after recovery is done
+ - unassigns the changelog partition
+ - unpauses source topic partitions
+ """
+ topic_name = "topic_name"
+ consumer_group = "group"
+ store_name = "default"
+ lowwater, highwater = 0, 10
+ assignment = [0, 1]
+ changelog_name = f"changelog__{consumer_group}--{topic_name}--{store_name}"
+
+ changelog_message = ConfluentKafkaMessageStub(
+ topic=changelog_name,
+ partition=0,
+ offset=highwater - 1,
+ key=b"key",
+ value=b"value",
+ headers=[(CHANGELOG_CF_MESSAGE_HEADER, b"default")],
+ )
+
+ # Register a source topic and a changelog topic with one partition
+ topic_manager = topic_manager_factory()
+ topic_manager.topic(topic_name)
+ topic_manager.changelog_topic(
+ topic_name=topic_name, store_name=store_name, consumer_group=consumer_group
+ )
+
+ # Create a RecoveryManager
+ consumer = MagicMock(spec_set=Consumer)
+ consumer.poll.return_value = changelog_message
+ consumer.assignment.return_value = assignment
+ recovery_manager = recovery_manager_factory(
+ consumer=consumer, topic_manager=topic_manager
+ )
+
+ # Assign a partition that needs recovery
+ consumer.get_watermark_offsets.return_value = (lowwater, highwater)
+ recovery_manager.assign_partition(
+ topic=topic_name,
+ partition=0,
+ committed_offset=-1001,
+ store_partitions={store_name: rocksdb_partition},
+ )
+
+ # Trigger a recovery
+ recovery_manager.do_recovery()
+
+ # Check that consumer first resumed the changelog topic partition
+ consumer_resume_calls = consumer.resume.call_args_list
+ assert consumer_resume_calls[0].args[0] == [
+ ConfluentPartition(topic=changelog_name, partition=0)
+ ]
+ # Check that consumer resumed all assigned partitions after recovery is done
+ assert consumer_resume_calls[1].args[0] == assignment
+
+ # Check that RecoveryPartitions are unassigned
+ assert not recovery_manager.partitions
+
+ def test_do_recovery_no_partitions_assigned(self, recovery_manager_factory):
+ # Create a RecoveryManager
+ consumer = MagicMock(spec_set=Consumer)
+ recovery_manager = recovery_manager_factory(consumer=consumer)
+ # Trigger a recovery
+ recovery_manager.do_recovery()
+
+ # Check that consumer.poll() is not called
+ assert not consumer.poll.called
diff --git a/tests/test_quixstreams/test_state/test_recovery/test_recovery_partition.py b/tests/test_quixstreams/test_state/test_recovery/test_recovery_partition.py
new file mode 100644
index 000000000..d2c1b67dc
--- /dev/null
+++ b/tests/test_quixstreams/test_state/test_recovery/test_recovery_partition.py
@@ -0,0 +1,68 @@
+import logging
+from unittest.mock import MagicMock
+
+from quixstreams.state.rocksdb import RocksDBStorePartition
+from tests.utils import ConfluentKafkaMessageStub
+
+
+class TestRecoveryPartition:
+ def test_set_watermarks(self, recovery_partition_factory):
+ recovery_partition = recovery_partition_factory()
+ recovery_partition.set_watermarks(50, 100)
+ assert recovery_partition.changelog_lowwater == 50
+ assert recovery_partition.changelog_highwater == 100
+
+ def test_needs_recovery(self, recovery_partition_factory):
+ store_partition = MagicMock(RocksDBStorePartition)
+ store_partition.get_changelog_offset.return_value = 10
+
+ recovery_partition = recovery_partition_factory(store_partition=store_partition)
+ recovery_partition.set_watermarks(0, 20)
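+        # Stored changelog offset (10) is behind the highwater (20), so recovery is needed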
+ assert recovery_partition.needs_recovery
+
+ def test_needs_recovery_caught_up(self, recovery_partition_factory):
+ store_partition = MagicMock(RocksDBStorePartition)
+ store_partition.get_changelog_offset.return_value = 10
+ recovery_partition = recovery_partition_factory(store_partition=store_partition)
+ recovery_partition.set_watermarks(0, 20)
+ store_partition.get_changelog_offset.return_value = 20
+ assert not recovery_partition.needs_recovery
+
+ def test_needs_recovery_no_valid_offsets(self, recovery_partition_factory):
+ # Create a RecoveryPartition with the offset ahead of the watermark
+ store_partition = MagicMock(RocksDBStorePartition)
+ store_partition.get_changelog_offset.return_value = 101
+
+ recovery_partition = recovery_partition_factory(store_partition=store_partition)
+ recovery_partition.set_watermarks(100, 100)
+ assert not recovery_partition.needs_recovery
+ assert recovery_partition.needs_offset_update
+
+ def test_recover_from_changelog_message(self, recovery_partition_factory):
+ store_partition = MagicMock(RocksDBStorePartition)
+ store_partition.get_changelog_offset.return_value = 10
+ recovery_partition = recovery_partition_factory(
+ store_partition=store_partition, committed_offset=1
+ )
+ recovery_partition.set_watermarks(10, 20)
+ msg = ConfluentKafkaMessageStub()
+ recovery_partition.recover_from_changelog_message(msg)
+
+ store_partition.recover_from_changelog_message.assert_called_with(
+ changelog_message=msg, committed_offset=1
+ )
+
+ def test_update_offset(self, recovery_partition_factory, caplog):
+ store_partition = MagicMock(RocksDBStorePartition)
+ store_partition.get_changelog_offset.return_value = 10
+ lowwater, highwater = 0, 9
+ recovery_partition = recovery_partition_factory(store_partition=store_partition)
+ recovery_partition.set_watermarks(lowwater, highwater)
+ recovery_partition.update_offset()
+
+ store_partition.set_changelog_offset.assert_called_with(
+ changelog_offset=highwater - 1
+ )
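+        # update_offset() should also log a warning when the stored offset is ahead of the highwater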
+ with caplog.at_level(level=logging.WARNING):
+ recovery_partition.update_offset()
+ assert caplog.text
diff --git a/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py b/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py
index 44a207dd8..b2176a560 100644
--- a/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py
+++ b/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py
@@ -1,45 +1,17 @@
import uuid
from typing import Optional
-from unittest.mock import create_autospec
+from unittest.mock import MagicMock, PropertyMock
import pytest
from quixstreams.state import ChangelogProducer, ChangelogProducerFactory
from quixstreams.state.rocksdb import RocksDBStore
from quixstreams.state.rocksdb.options import RocksDBOptions
-from quixstreams.state.rocksdb.partition import (
- RocksDBStorePartition,
-)
-
-
-TEST_KEYS = [
- "string",
- 123,
- 123.123,
- (123, 456),
-]
-
-TEST_VALUES = [
- None,
- "string",
- 123,
- 123.123,
- {"key": "value", "mapping": {"key": "value"}},
- [123, 456],
-]
-
-TEST_PREFIXES = [
- b"some_bytes",
- "string",
- 123,
- 123.123,
- (123, 456),
- [123, 456],
-]
+from quixstreams.state.rocksdb.partition import RocksDBStorePartition
@pytest.fixture()
-def rocksdb_partition_factory(tmp_path):
+def rocksdb_partition_factory(tmp_path, changelog_producer_mock):
def factory(
name: str = "db",
options: Optional[RocksDBOptions] = None,
@@ -47,13 +19,9 @@ def factory(
) -> RocksDBStorePartition:
path = (tmp_path / name).as_posix()
_options = options or RocksDBOptions(open_max_retries=0, open_retry_backoff=3.0)
- if not changelog_producer:
- changelog_producer = create_autospec(ChangelogProducer)(
- "topic", "partition", "producer"
- )
return RocksDBStorePartition(
path,
- changelog_producer=changelog_producer,
+ changelog_producer=changelog_producer or changelog_producer_mock,
options=_options,
)
@@ -90,3 +58,12 @@ def rocksdb_store(rocksdb_store_factory) -> RocksDBStore:
store = rocksdb_store_factory()
yield store
store.close()
+
+
+@pytest.fixture()
+def changelog_producer_mock():
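+    # Spec'd ChangelogProducer mock exposing the read-only attributes the store partitions use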
+ producer = MagicMock(spec_set=ChangelogProducer)
+ type(producer).source_topic_name = PropertyMock(return_value="test-source-topic")
+ type(producer).changelog_name = PropertyMock(return_value="test-changelog-topic")
+ type(producer).partition = PropertyMock(return_value=0)
+ return producer
diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py
index a1f718343..2531414e8 100644
--- a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py
+++ b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py
@@ -15,9 +15,10 @@
from quixstreams.state.rocksdb.metadata import (
CHANGELOG_CF_MESSAGE_HEADER,
PREFIX_SEPARATOR,
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER,
)
from quixstreams.utils.json import dumps
-from ...utils import ConfluentKafkaMessageStub
+from tests.utils import ConfluentKafkaMessageStub
class TestRocksDBStorePartition:
@@ -161,7 +162,9 @@ def test_ensure_metadata_cf(self, rocksdb_partition):
class TestRocksDBStorePartitionChangelog:
@pytest.mark.parametrize("store_value", [10, None])
- def test_recover_from_changelog_message(self, rocksdb_partition, store_value):
+ def test_recover_from_changelog_message_no_processed_offset(
+ self, rocksdb_partition, store_value
+ ):
"""
Tests both a put (10) and delete (None)
"""
@@ -174,11 +177,12 @@ def test_recover_from_changelog_message(self, rocksdb_partition, store_value):
offset=50,
)
- rocksdb_partition.recover_from_changelog_message(changelog_msg)
+ rocksdb_partition.recover_from_changelog_message(
+ changelog_msg, committed_offset=-1001
+ )
with rocksdb_partition.begin() as tx:
- with tx.with_prefix(kafka_key):
- assert tx.get(user_store_key) == store_value
+ assert tx.get(user_store_key, prefix=kafka_key) == store_value
assert rocksdb_partition.get_changelog_offset() == changelog_msg.offset() + 1
@pytest.mark.parametrize(
@@ -188,7 +192,7 @@ def test_recover_from_changelog_message(self, rocksdb_partition, store_value):
([], ColumnFamilyHeaderMissing),
],
)
- def test_recover_from_changelog_message_cf_errors(
+ def test_recover_from_changelog_message_missing_cf_headers(
self, rocksdb_partition, headers, error
):
changelog_msg = ConfluentKafkaMessageStub(
@@ -198,5 +202,86 @@ def test_recover_from_changelog_message_cf_errors(
offset=50,
)
with pytest.raises(error):
- rocksdb_partition.recover_from_changelog_message(changelog_msg)
+ rocksdb_partition.recover_from_changelog_message(
+ changelog_msg, committed_offset=-1001
+ )
assert rocksdb_partition.get_changelog_offset() is None
+
+ def test_recover_from_changelog_message_with_processed_offset_behind_committed(
+ self, rocksdb_partition
+ ):
+ """
+ Test that changes from the changelog topic are applied if the
+ source topic offset header is present and is smaller than the latest committed
+ offset.
+ """
+ kafka_key = b"my_key"
+ user_store_key = "count"
+
+ processed_offset_header = (
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER,
+ dumps(1),
+ )
+        committed_offset = 2
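+        # The processed offset (1) is strictly lower than the committed offset (2),
+        # so the change below must be applied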
+ changelog_msg = ConfluentKafkaMessageStub(
+ key=kafka_key + PREFIX_SEPARATOR + dumps(user_store_key),
+ value=dumps(10),
+ headers=[
+ (CHANGELOG_CF_MESSAGE_HEADER, b"default"),
+ processed_offset_header,
+ ],
+ )
+
+ rocksdb_partition.recover_from_changelog_message(
+            changelog_msg, committed_offset=committed_offset
+ )
+
+ with rocksdb_partition.begin() as tx:
+ assert tx.get(user_store_key, prefix=kafka_key) == 10
+ assert rocksdb_partition.get_changelog_offset() == changelog_msg.offset() + 1
+
+ def test_recover_from_changelog_message_with_processed_offset_ahead_committed(
+ self, rocksdb_partition
+ ):
+ """
+ Test that changes from the changelog topic are NOT applied if the
+        source topic offset header is present and is not lower than the latest
+        committed offset.
+ It means that the changelog messages were produced during the checkpoint,
+ but the topic offset was not committed.
+ Possible reasons:
+ - Producer couldn't verify the delivery of every changelog message
+ - Consumer failed to commit the source topic offsets
+ """
+ kafka_key = b"my_key"
+ user_store_key = "count"
+ # Processed offset should be strictly lower than committed offset for
+ # the change to be applied
+ processed_offset = 2
+ committed_offset = 2
+
+        # Generate the changelog message with the processed offset equal to the
+        # committed one (i.e. not strictly lower), so the change must be skipped
+ processed_offset_header = (
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER,
+ dumps(processed_offset),
+ )
+ changelog_msg = ConfluentKafkaMessageStub(
+ key=kafka_key + PREFIX_SEPARATOR + dumps(user_store_key),
+ value=dumps(10),
+ headers=[
+ (CHANGELOG_CF_MESSAGE_HEADER, b"default"),
+ processed_offset_header,
+ ],
+ )
+
+ # Recover from the message
+ rocksdb_partition.recover_from_changelog_message(
+ changelog_msg, committed_offset=committed_offset
+ )
+
+ # Check that the changes have not been applied, but the changelog offset
+ # increased
+ with rocksdb_partition.begin() as tx:
+ assert tx.get(user_store_key, prefix=kafka_key) is None
+ assert rocksdb_partition.get_changelog_offset() == changelog_msg.offset() + 1
diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_store.py b/tests/test_quixstreams/test_state/test_rocksdb/test_store.py
index f82030f3c..adda48011 100644
--- a/tests/test_quixstreams/test_state/test_rocksdb/test_store.py
+++ b/tests/test_quixstreams/test_state/test_rocksdb/test_store.py
@@ -26,15 +26,16 @@ def test_revoke_partition_not_assigned(self, rocksdb_store):
rocksdb_store.revoke_partition(0)
def test_create_transaction(self, rocksdb_store):
+ prefix = b"__key__"
rocksdb_store.assign_partition(0)
with rocksdb_store.start_partition_transaction(0) as tx:
- tx.set("key", "value")
+ tx.set("key", "value", prefix=prefix)
rocksdb_store.revoke_partition(0)
# Assign partition again and check the value
rocksdb_store.assign_partition(0)
with rocksdb_store.start_partition_transaction(0) as tx:
- assert tx.get("key") == "value"
+ assert tx.get("key", prefix=prefix) == "value"
assert rocksdb_store._changelog_producer_factory is None
def test_get_transaction_partition_not_assigned(self, rocksdb_store):
diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py
index d2a5da1ac..f8099e486 100644
--- a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py
+++ b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py
@@ -1,7 +1,7 @@
import contextlib
import secrets
from datetime import datetime
-from unittest.mock import patch, call
+from unittest.mock import patch
import pytest
import rocksdict
@@ -10,15 +10,41 @@
StateSerializationError,
StateTransactionError,
RocksDBStorePartition,
- NestedPrefixError,
RocksDBOptions,
+ RocksDBPartitionTransaction,
+ InvalidChangelogOffset,
)
from quixstreams.state.rocksdb.metadata import (
CHANGELOG_CF_MESSAGE_HEADER,
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER,
)
from quixstreams.state.rocksdb.serialization import serialize
from quixstreams.utils.json import dumps
-from .fixtures import TEST_KEYS, TEST_VALUES, TEST_PREFIXES
+
+TEST_KEYS = [
+ "string",
+ 123,
+ 123.123,
+ (123, 456),
+]
+
+TEST_VALUES = [
+ None,
+ "string",
+ 123,
+ 123.123,
+ {"key": "value", "mapping": {"key": "value"}},
+ [123, 456],
+]
+
+TEST_PREFIXES = [
+ b"some_bytes",
+ "string",
+ 123,
+ 123.123,
+ (123, 456),
+ [123, 456],
+]
class TestRocksDBPartitionTransaction:
@@ -28,136 +54,33 @@ def test_transaction_complete(self, rocksdb_partition):
assert tx.completed
- def test_transaction_with_changelog(self, rocksdb_partition):
- changelog_producer = rocksdb_partition._changelog_producer
- key_out = "my_key"
- value_out = "my_value"
- cf = "default"
- db_writes = 3
- assert rocksdb_partition.get_changelog_offset() is None
-
- with rocksdb_partition.begin() as tx:
- for i in range(db_writes):
- tx.set(key=f"{key_out}{i}", value=f"{value_out}{i}", cf_name=cf)
-
- changelog_producer.produce.assert_has_calls(
- [
- call(
- key=tx._serialize_key(key=f"{key_out}{i}"),
- value=tx._serialize_value(value=f"{value_out}{i}"),
- headers={CHANGELOG_CF_MESSAGE_HEADER: cf},
- )
- for i in range(db_writes)
- ]
- )
- assert changelog_producer.produce.call_count == db_writes
- assert tx.completed
- assert rocksdb_partition.get_changelog_offset() == db_writes
-
- def test_transaction_with_changelog_delete(self, rocksdb_partition):
- changelog_producer = rocksdb_partition._changelog_producer
- key_out = "my_key"
- value_out = "my_value"
- cf = "default"
- assert rocksdb_partition.get_changelog_offset() is None
-
- with rocksdb_partition.begin() as tx:
- tx.set(key=key_out, value=value_out, cf_name=cf)
-
- with rocksdb_partition.begin() as tx:
- tx.delete(key=key_out, cf_name=cf)
-
- changelog_producer.produce.assert_has_calls(
- [
- call(
- key=tx._serialize_key(key=key_out),
- value=tx._serialize_value(value=value_out),
- headers={CHANGELOG_CF_MESSAGE_HEADER: cf},
- ),
- call(
- key=tx._serialize_key(key=key_out),
- value=None,
- headers={CHANGELOG_CF_MESSAGE_HEADER: cf},
- ),
- ]
- )
- assert changelog_producer.produce.call_count == 2
- assert tx.completed
- assert rocksdb_partition.get_changelog_offset() == 2
-
- def test_transaction_with_changelog_delete_cached(self, rocksdb_partition):
- changelog_producer = rocksdb_partition._changelog_producer
- key_out = "my_key"
- value_out = "my_value"
- cf = "default"
- db_writes = 3
- delete_index = 2
- assert rocksdb_partition.get_changelog_offset() is None
-
- with rocksdb_partition.begin() as tx:
- for i in range(db_writes):
- tx.set(key=f"{key_out}{i}", value=f"{value_out}{i}", cf_name=cf)
- tx.delete(key=f"{key_out}{delete_index}", cf_name=cf)
-
- changelog_producer.produce.assert_has_calls(
- [
- call(
- key=tx._serialize_key(key=f"{key_out}{i}"),
- value=tx._serialize_value(value=f"{value_out}{i}"),
- headers={CHANGELOG_CF_MESSAGE_HEADER: cf},
- )
- for i in range(db_writes - 1)
- ]
- + [
- call(
- key=tx._serialize_key(key=f"{key_out}{delete_index}"),
- value=None,
- headers={CHANGELOG_CF_MESSAGE_HEADER: cf},
- )
- ]
- )
- assert changelog_producer.produce.call_count == db_writes
- assert tx.completed
- assert rocksdb_partition.get_changelog_offset() == db_writes
-
- def test_transaction_with_changelog_delete_nonexisting_key(self, rocksdb_partition):
- changelog_producer = rocksdb_partition._changelog_producer
- key_out = "my_key"
- cf = "default"
- assert rocksdb_partition.get_changelog_offset() is None
-
- with rocksdb_partition.begin() as tx:
- tx.delete(key=key_out, cf_name=cf)
-
- changelog_producer.produce.assert_called_with(
- key=tx._serialize_key(key=key_out),
- value=None,
- headers={CHANGELOG_CF_MESSAGE_HEADER: cf},
- )
-
- assert tx.completed
- assert rocksdb_partition.get_changelog_offset() == 1
-
- def test_transaction_doesnt_write_empty_batch(self, rocksdb_partition):
+ def test_transaction_doesnt_write_empty_batch(
+ self, changelog_producer_mock, rocksdb_partition_factory
+ ):
"""
Test that transaction doesn't call "StateStore.write()" if the internal
WriteBatch is empty (i.e. no keys were updated during the transaction).
Writing empty batches costs more than doing
"""
- changelog_producer = rocksdb_partition._changelog_producer
- with patch.object(RocksDBStorePartition, "write") as mocked:
- with rocksdb_partition.begin() as tx:
- tx.get("key")
- with rocksdb_partition.begin() as tx:
- tx.get("key")
+ prefix = b"__key__"
+ with rocksdb_partition_factory(
+ changelog_producer=changelog_producer_mock
+ ) as partition:
+ with patch.object(RocksDBStorePartition, "write") as mocked:
+ with partition.begin() as tx:
+ tx.get("key", prefix=prefix)
+
+ with partition.begin() as tx:
+ tx.get("key", prefix=prefix)
assert not mocked.called
- assert not changelog_producer.produce.called
+ assert not changelog_producer_mock.produce.called
def test_delete_key_doesnt_exist(self, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- tx.delete("key")
+ tx.delete("key", prefix=prefix)
@pytest.mark.parametrize(
"key",
@@ -168,9 +91,10 @@ def test_delete_key_doesnt_exist(self, rocksdb_partition):
TEST_VALUES,
)
def test_get_key_exists_cached(self, key, value, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- tx.set(key, value)
- stored = tx.get(key)
+ tx.set(key, value, prefix=prefix)
+ stored = tx.get(key, prefix=prefix)
assert stored == value
@pytest.mark.parametrize(
@@ -182,65 +106,76 @@ def test_get_key_exists_cached(self, key, value, rocksdb_partition):
TEST_VALUES,
)
def test_get_key_exists_no_cache(self, key, value, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- tx.set(key, value)
+ tx.set(key, value, prefix=prefix)
+
with rocksdb_partition.begin() as tx:
- stored = tx.get(key, value)
+ stored = tx.get(key, prefix=prefix)
assert stored == value
def test_get_key_doesnt_exist_default(self, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- value = tx.get("key", default=123)
+ value = tx.get("key", default=123, prefix=prefix)
assert value == 123
def test_delete_key_cached_no_flush(self, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- tx.set("key", "value")
- assert tx.get("key") == "value"
- tx.delete("key")
- assert tx.get("key") is None
+ tx.set("key", "value", prefix=prefix)
+ assert tx.get("key", prefix=prefix) == "value"
+ tx.delete("key", prefix=prefix)
+ assert tx.get("key", prefix=prefix) is None
def test_delete_key_cached(self, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- tx.set("key", "value")
+ tx.set("key", "value", prefix=prefix)
with rocksdb_partition.begin() as tx:
- assert tx.get("key") == "value"
- tx.delete("key")
- assert tx.get("key") is None
+ assert tx.get("key", prefix=prefix) == "value"
+ tx.delete("key", prefix=prefix)
+ assert tx.get("key", prefix=prefix) is None
def test_delete_key_no_cache(self, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- tx.set("key", "value")
- assert tx.get("key") == "value"
+ tx.set("key", "value", prefix=prefix)
+ assert tx.get("key", prefix=prefix) == "value"
with rocksdb_partition.begin() as tx:
- tx.delete("key")
+ tx.delete("key", prefix=prefix)
with rocksdb_partition.begin() as tx:
- assert tx.get("key") is None
+ assert tx.get("key", prefix=prefix) is None
def test_key_exists_cached(self, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- tx.set("key", "value")
- assert tx.exists("key")
- assert not tx.exists("key123")
+ tx.set("key", "value", prefix=prefix)
+ assert tx.exists("key", prefix=prefix)
+ assert not tx.exists("key123", prefix=prefix)
def test_key_exists_no_cache(self, rocksdb_partition):
+ prefix = b"__key__"
+
with rocksdb_partition.begin() as tx:
- tx.set("key", "value")
+ tx.set("key", "value", prefix=prefix)
+
with rocksdb_partition.begin() as tx:
- assert tx.exists("key")
- assert not tx.exists("key123")
+ assert tx.exists("key", prefix=prefix)
+ assert not tx.exists("key123", prefix=prefix)
def test_key_exists_deleted_in_cache(self, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- tx.set("key", "value")
+ tx.set("key", "value", prefix=prefix)
with rocksdb_partition.begin() as tx:
- assert tx.exists("key")
- tx.delete("key")
- assert not tx.exists("key")
+ assert tx.exists("key", prefix=prefix)
+ tx.delete("key", prefix=prefix)
+ assert not tx.exists("key", prefix=prefix)
@pytest.mark.parametrize(
"key, value",
@@ -252,15 +187,17 @@ def test_key_exists_deleted_in_cache(self, rocksdb_partition):
],
)
def test_set_serialization_error(self, key, value, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
with pytest.raises(StateSerializationError):
- tx.set(key, value)
+ tx.set(key, value, prefix=prefix)
@pytest.mark.parametrize("key", [object(), b"somebytes", datetime.utcnow()])
def test_delete_serialization_error(self, key, rocksdb_partition):
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
with pytest.raises(StateSerializationError):
- tx.delete(key)
+ tx.delete(key, prefix=prefix)
def test_get_deserialization_error(self, rocksdb_partition):
bytes_ = secrets.token_bytes(10)
@@ -275,46 +212,33 @@ def test_get_deserialization_error(self, rocksdb_partition):
with rocksdb_partition.begin() as tx:
with pytest.raises(StateSerializationError):
- tx.get(string_)
+ tx.get(string_, prefix=b"")
with pytest.raises(StateSerializationError):
- tx.get(bytes_)
-
- @pytest.mark.parametrize("prefix", TEST_PREFIXES)
- def test_set_key_with_prefix_no_cache(self, prefix, rocksdb_partition):
- with rocksdb_partition.begin() as tx:
- with tx.with_prefix(prefix):
- tx.set("key", "value")
-
- with rocksdb_partition.begin() as tx:
- with tx.with_prefix(prefix):
- assert tx.get("key") == "value"
-
- with rocksdb_partition.begin() as tx:
- assert tx.get("key") is None
+ tx.get(bytes_, prefix=b"")
- @pytest.mark.parametrize("prefix", TEST_PREFIXES)
- def test_delete_key_with_prefix_no_cache(self, prefix, rocksdb_partition):
+ def test_set_key_different_prefixes(self, rocksdb_partition):
+ prefix1, prefix2 = b"__key1__", b"__key2__"
with rocksdb_partition.begin() as tx:
- with tx.with_prefix(prefix):
- tx.set("key", "value")
+ tx.set("key", "value", prefix=prefix1)
+ assert tx.get("key", prefix=prefix1) == "value"
+ assert tx.get("key", prefix=prefix2) is None
+ def test_delete_key_different_prefixes_no_cache(self, rocksdb_partition):
+ prefix1, prefix2 = b"__key1__", b"__key2__"
with rocksdb_partition.begin() as tx:
- with tx.with_prefix(prefix):
- assert tx.get("key") == "value"
-
- with rocksdb_partition.begin() as tx:
- with tx.with_prefix(prefix):
- tx.delete("key")
-
- with rocksdb_partition.begin() as tx:
- with tx.with_prefix(prefix):
- assert tx.get("key") is None
+ tx.set("key", "value", prefix=prefix1)
+ tx.set("key", "value", prefix=prefix2)
+ assert tx.get("key", prefix=prefix1) == "value"
+ assert tx.get("key", prefix=prefix2) == "value"
+ tx.delete("key", prefix=prefix1)
+ assert tx.get("key", prefix=prefix1) is None
+ assert tx.get("key", prefix=prefix2) is not None
@pytest.mark.parametrize(
"operation",
[
- lambda tx: tx.set("key", "value"),
- lambda tx: tx.delete("key"),
+ lambda tx, prefix: tx.set("key", "value", prefix=prefix),
+ lambda tx, prefix: tx.delete("key", prefix=prefix),
],
)
def test_update_key_failed_transaction_failed(self, operation, rocksdb_partition):
@@ -322,143 +246,296 @@ def test_update_key_failed_transaction_failed(self, operation, rocksdb_partition
Test that if the update operation (set or delete) fails the transaction is
marked as failed and cannot be re-used anymore.
"""
+
+ prefix = b"__key__"
with patch.object(
- rocksdict.WriteBatch, "put", side_effect=ValueError("test")
- ), patch.object(rocksdict.WriteBatch, "delete", side_effect=ValueError("test")):
+ RocksDBPartitionTransaction,
+ "_serialize_key",
+ side_effect=ValueError("test"),
+ ):
with rocksdb_partition.begin() as tx:
with contextlib.suppress(ValueError):
- operation(tx=tx)
+ operation(tx=tx, prefix=prefix)
assert tx.failed
# Ensure that Transaction cannot be used after it's failed
with pytest.raises(StateTransactionError):
- tx.set("key", "value")
+ tx.set("key", "value", prefix=prefix)
with pytest.raises(StateTransactionError):
- tx.get("key")
+ tx.get("key", prefix=prefix)
with pytest.raises(StateTransactionError):
- tx.delete("key")
+ tx.delete("key", prefix=prefix)
with pytest.raises(StateTransactionError):
- tx.exists("key")
+ tx.exists("key", prefix=prefix)
with pytest.raises(StateTransactionError):
- tx.maybe_flush()
+ tx.flush()
assert not tx.completed
- def test_flush_failed_transaction_failed(self, rocksdb_partition):
+ def test_update_key_prepared_transaction_fails(self, rocksdb_partition):
"""
- Test that if the "maybe_flush()" fails the transaction is also marked
- as failed and cannot be re-used anymore.
+ Test that any update operation (set or delete) fails if the transaction is
+ marked as prepared.
"""
- with patch.object(
- RocksDBStorePartition, "write", side_effect=ValueError("test")
- ):
- with rocksdb_partition.begin() as tx:
- tx.set("key", "value")
-
- with contextlib.suppress(ValueError):
- tx.maybe_flush()
-
- assert tx.failed
-
- # Ensure that Transaction cannot be used after it's failed
- with pytest.raises(StateTransactionError):
- tx.set("key", "value")
+ prefix = b"__key__"
+ tx = rocksdb_partition.begin()
- with pytest.raises(StateTransactionError):
- tx.get("key")
+ tx.set(key="key", value="value", prefix=prefix)
+ tx.prepare(processed_offset=1)
+ assert tx.prepared
- with pytest.raises(StateTransactionError):
- tx.delete("key")
+ with pytest.raises(StateTransactionError):
+ tx.set("key", value="value", prefix=prefix)
- with pytest.raises(StateTransactionError):
- tx.exists("key")
-
- assert tx.completed
+ with pytest.raises(StateTransactionError):
+ tx.delete("key", prefix=prefix)
def test_transaction_not_flushed_on_error(self, rocksdb_partition):
+ prefix = b"__key__"
with contextlib.suppress(ValueError):
with rocksdb_partition.begin() as tx:
- tx.set("key", "value")
+ tx.set("key", "value", prefix=prefix)
raise ValueError("test")
with rocksdb_partition.begin() as tx:
- assert tx.get("key") is None
-
- def test_nested_prefixes_fail(self, rocksdb_partition):
- tx = rocksdb_partition.begin()
- with pytest.raises(NestedPrefixError):
- with tx.with_prefix("prefix"):
- with tx.with_prefix("prefix"):
- ...
+ assert tx.get("key", prefix=prefix) is None
def test_custom_dumps_loads(self, rocksdb_partition_factory):
key = secrets.token_bytes(10)
value = secrets.token_bytes(10)
+ prefix = b"__key__"
with rocksdb_partition_factory(
options=RocksDBOptions(loads=lambda v: v, dumps=lambda v: v)
) as db:
with db.begin() as tx:
- tx.set(key, value)
+ tx.set(key, value, prefix=prefix)
with db.begin() as tx:
- assert tx.get(key) == value
+ assert tx.get(key, prefix=prefix) == value
def test_set_dict_nonstr_keys_fails(self, rocksdb_partition):
key = "key"
value = {0: 1}
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
with pytest.raises(StateSerializationError):
- tx.set(key, value)
+ tx.set(key, value, prefix=prefix)
def test_set_datetime_fails(self, rocksdb_partition):
key = "key"
value = datetime.utcnow()
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
with pytest.raises(StateSerializationError):
- tx.set(key, value)
+ tx.set(key, value, prefix=prefix)
def test_set_get_with_column_family(self, rocksdb_partition):
key = "key"
value = "value"
+ prefix = b"__key__"
rocksdb_partition.create_column_family("cf")
with rocksdb_partition.begin() as tx:
- tx.set(key, value, cf_name="cf")
- assert tx.get(key, cf_name="cf") == value
+ tx.set(key, value, cf_name="cf", prefix=prefix)
+ assert tx.get(key, cf_name="cf", prefix=prefix) == value
with rocksdb_partition.begin() as tx:
- assert tx.get(key, cf_name="cf") == value
+ assert tx.get(key, cf_name="cf", prefix=prefix) == value
def test_set_delete_get_with_column_family(self, rocksdb_partition):
key = "key"
value = "value"
+ prefix = b"__key__"
rocksdb_partition.create_column_family("cf")
with rocksdb_partition.begin() as tx:
- tx.set(key, value, cf_name="cf")
- assert tx.get(key, cf_name="cf") == value
- tx.delete(key, cf_name="cf")
- assert tx.get(key, cf_name="cf") is None
+ tx.set(key, value, cf_name="cf", prefix=prefix)
+ assert tx.get(key, cf_name="cf", prefix=prefix) == value
+ tx.delete(key, cf_name="cf", prefix=prefix)
+ assert tx.get(key, cf_name="cf", prefix=prefix) is None
with rocksdb_partition.begin() as tx:
- assert tx.get(key, cf_name="cf") is None
+ assert tx.get(key, cf_name="cf", prefix=prefix) is None
def test_set_exists_get_with_column_family(self, rocksdb_partition):
key = "key"
value = "value"
rocksdb_partition.create_column_family("cf")
+ prefix = b"__key__"
with rocksdb_partition.begin() as tx:
- assert not tx.exists(key, cf_name="cf")
- tx.set(key, value, cf_name="cf")
- assert tx.exists(key, cf_name="cf")
+ assert not tx.exists(key, cf_name="cf", prefix=prefix)
+ tx.set(key, value, cf_name="cf", prefix=prefix)
+ assert tx.exists(key, cf_name="cf", prefix=prefix)
with rocksdb_partition.begin() as tx:
- assert tx.exists(key, cf_name="cf")
+ assert tx.exists(key, cf_name="cf", prefix=prefix)
+
+ def test_flush_failed_transaction_failed(self, rocksdb_partition):
+ """
+ Test that if the "flush()" fails the transaction is also marked
+ as failed and cannot be re-used.
+ """
+
+ prefix = b"__key__"
+ with patch.object(
+ RocksDBStorePartition, "write", side_effect=ValueError("test")
+ ):
+ with rocksdb_partition.begin() as tx:
+ tx.set("key", "value", prefix=prefix)
+
+ with contextlib.suppress(ValueError):
+ tx.flush()
+
+ assert tx.failed
+
+ # Ensure that Transaction cannot be used after it's failed
+ with pytest.raises(StateTransactionError):
+ tx.set("key", "value", prefix=prefix)
+
+ with pytest.raises(StateTransactionError):
+ tx.get("key", prefix=prefix)
+
+ with pytest.raises(StateTransactionError):
+ tx.delete("key", prefix=prefix)
+
+ with pytest.raises(StateTransactionError):
+ tx.exists("key", prefix=prefix)
+
+ @pytest.mark.parametrize(
+ "processed_offset, changelog_offset", [(None, None), (1, 1)]
+ )
+ def test_flush_success(self, processed_offset, changelog_offset, rocksdb_partition):
+ tx = rocksdb_partition.begin()
+
+ # Set some key to probe the transaction
+ tx.set(key="key", value="value", prefix=b"__key__")
+
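+        # After a successful flush the stored offsets should match what was passed (None when not provided)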
+ tx.flush(processed_offset=processed_offset, changelog_offset=changelog_offset)
+ assert tx.completed
+
+ assert rocksdb_partition.get_changelog_offset() == changelog_offset
+ assert rocksdb_partition.get_processed_offset() == processed_offset
+
+ def test_flush_invalid_changelog_offset(self, rocksdb_partition):
+ tx1 = rocksdb_partition.begin()
+ # Set some key to probe the transaction
+ tx1.set(key="key", value="value", prefix=b"__key__")
+
+ # Flush first transaction to update the changelog offset
+ tx1.flush(changelog_offset=9999)
+ assert tx1.completed
+
+ tx2 = rocksdb_partition.begin()
+ tx2.set(key="key", value="value", prefix=b"__key__")
+ # Flush second transaction with a smaller changelog offset
+ with pytest.raises(InvalidChangelogOffset):
+ tx2.flush(changelog_offset=1)
+ assert tx2.failed
+
+ def test_set_and_prepare(self, rocksdb_partition_factory, changelog_producer_mock):
+ data = [
+ ("key1", "value1"),
+ ("key2", "value2"),
+ ("key3", "value3"),
+ ]
+ cf = "default"
+ prefix = b"__key__"
+ processed_offset = 1
+
+ with rocksdb_partition_factory(
+ changelog_producer=changelog_producer_mock
+ ) as partition:
+ tx = partition.begin()
+ for key, value in data:
+ tx.set(
+ key=key,
+ value=value,
+ cf_name=cf,
+ prefix=prefix,
+ )
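+            # prepare() should produce one changelog message per updated key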
+ tx.prepare(processed_offset=processed_offset)
+
+ assert changelog_producer_mock.produce.call_count == len(data)
+
+ for (key, value), call in zip(
+ data, changelog_producer_mock.produce.call_args_list
+ ):
+ assert call.kwargs["key"] == tx._serialize_key(key=key, prefix=prefix)
+ assert call.kwargs["value"] == tx._serialize_value(value=value)
+ assert call.kwargs["headers"] == {
+ CHANGELOG_CF_MESSAGE_HEADER: cf,
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset),
+ }
+
+ assert tx.prepared
+
+ def test_delete_and_prepare(
+ self, rocksdb_partition_factory, changelog_producer_mock
+ ):
+ key, value = "key", "value"
+ cf = "default"
+ prefix = b"__key__"
+ processed_offset = 1
+
+ with rocksdb_partition_factory(
+ changelog_producer=changelog_producer_mock
+ ) as partition:
+
+ tx = partition.begin()
+ tx.delete(key=key, cf_name=cf, prefix=prefix)
+
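+            # A delete should produce a single changelog tombstone (value=None)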
+ tx.prepare(processed_offset=processed_offset)
+
+ assert tx.prepared
+ assert changelog_producer_mock.produce.call_count == 1
+
+ delete_changelog = changelog_producer_mock.produce.call_args_list[0]
+ assert delete_changelog.kwargs["key"] == tx._serialize_key(
+ key=key, prefix=prefix
+ )
+ assert delete_changelog.kwargs["value"] is None
+ assert delete_changelog.kwargs["headers"] == {
+ CHANGELOG_CF_MESSAGE_HEADER: cf,
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset),
+ }
+
+ def test_set_delete_and_prepare(
+ self, rocksdb_partition_factory, changelog_producer_mock
+ ):
+ """
+ Test that only "delete" changelog message is emited if the key is set
+ and deleted in the same transaction.
+ """
+ key, value = "key", "value"
+ cf = "default"
+ prefix = b"__key__"
+ processed_offset = 1
+
+ with rocksdb_partition_factory(
+ changelog_producer=changelog_producer_mock
+ ) as partition:
+ tx = partition.begin()
+ tx.set(key=key, value=value, cf_name=cf, prefix=prefix)
+ tx.delete(key=key, cf_name=cf, prefix=prefix)
+
+ tx.prepare(processed_offset=processed_offset)
+
+ assert tx.prepared
+ assert changelog_producer_mock.produce.call_count == 1
+ delete_changelog = changelog_producer_mock.produce.call_args_list[0]
+ assert delete_changelog.kwargs["key"] == tx._serialize_key(
+ key=key, prefix=prefix
+ )
+ assert delete_changelog.kwargs["value"] is None
+ assert delete_changelog.kwargs["headers"] == {
+ CHANGELOG_CF_MESSAGE_HEADER: cf,
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset),
+ }
diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py
index 727d8b31a..e147b5844 100644
--- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py
+++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py
@@ -5,7 +5,9 @@
import pytest
from quixstreams.rowproducer import RowProducer
-from quixstreams.state.recovery import ChangelogProducerFactory
+from quixstreams.state.recovery import ChangelogProducerFactory, ChangelogProducer
+from quixstreams.state.rocksdb import RocksDBOptions
+from quixstreams.state.rocksdb.windowed.partition import WindowedRocksDBStorePartition
from quixstreams.state.rocksdb.windowed.store import WindowedRocksDBStore
@@ -26,15 +28,38 @@ def factory(
@pytest.fixture()
-def windowed_rocksdb_store_factory_changelog(tmp_path):
+def windowed_rocksdb_partition_factory(tmp_path):
+ def factory(
+ name: str = "db",
+ options: Optional[RocksDBOptions] = None,
+ changelog_producer: Optional[ChangelogProducer] = None,
+ ) -> WindowedRocksDBStorePartition:
+ path = (tmp_path / name).as_posix()
+ _options = options or RocksDBOptions(open_max_retries=0, open_retry_backoff=3.0)
+ if not changelog_producer:
+ changelog_producer = create_autospec(ChangelogProducer)(
+ "topic", "partition", "producer"
+ )
+ return WindowedRocksDBStorePartition(
+ path,
+ changelog_producer=changelog_producer,
+ options=_options,
+ )
+
+ return factory
+
+
+@pytest.fixture()
+def windowed_rocksdb_store_factory_changelog(tmp_path, changelog_producer_mock):
def factory(
topic: Optional[str] = None,
changelog: Optional[str] = None,
name: str = "default",
producer: Optional[RowProducer] = None,
) -> WindowedRocksDBStore:
+ topic = topic or str(uuid.uuid4())
return WindowedRocksDBStore(
- topic=topic or str(uuid.uuid4()),
+ topic=topic,
name=name,
base_dir=str(tmp_path),
changelog_producer_factory=ChangelogProducerFactory(
diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py
index 2c7679e36..47e4ce176 100644
--- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py
+++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py
@@ -10,7 +10,7 @@
)
from quixstreams.state.rocksdb.windowed.serialization import encode_window_key
from quixstreams.utils.json import dumps
-from tests.test_quixstreams.utils import ConfluentKafkaMessageStub
+from tests.utils import ConfluentKafkaMessageStub
class TestWindowedRocksDBPartitionTransactionChangelog:
@@ -37,13 +37,14 @@ def test_recover_window_from_changelog_message(
offset=50,
)
- store_partition.recover_from_changelog_message(changelog_msg)
-
+ store_partition.recover_from_changelog_message(
+ changelog_msg, committed_offset=-1001
+ )
with store_partition.begin() as tx:
- with tx.with_prefix(kafka_key):
- assert (
- tx.get_window(window["start_ms"], window["end_ms"]) == store_value
- )
+ assert (
+ tx.get_window(window["start_ms"], window["end_ms"], prefix=kafka_key)
+ == store_value
+ )
assert store_partition.get_changelog_offset() == changelog_msg.offset() + 1
def test_recover_latest_expire_from_changelog_message(
@@ -67,15 +68,17 @@ def test_recover_latest_expire_from_changelog_message(
offset=50,
)
- store_partition.recover_from_changelog_message(changelog_msg)
+ store_partition.recover_from_changelog_message(
+ changelog_msg, committed_offset=-1001
+ )
with store_partition.begin() as tx:
- with tx.with_prefix(kafka_key):
- assert (
- tx.get(
- LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY,
- cf_name=LATEST_EXPIRED_WINDOW_CF_NAME,
- )
- == store_value
+ assert (
+ tx.get(
+ LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY,
+ cf_name=LATEST_EXPIRED_WINDOW_CF_NAME,
+ prefix=kafka_key,
)
+ == store_value
+ )
assert store_partition.get_changelog_offset() == changelog_msg.offset() + 1
diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_state.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_state.py
new file mode 100644
index 000000000..b48740faf
--- /dev/null
+++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_state.py
@@ -0,0 +1,56 @@
+class TestWindowedRocksDBPartitionTransactionState:
+ def test_update_window(self, windowed_rocksdb_store_factory):
+ store = windowed_rocksdb_store_factory()
+ store.assign_partition(0)
+ prefix = b"__key__"
+ with store.start_partition_transaction(0) as tx:
+ state = tx.as_state(prefix=prefix)
+ state.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
+ assert state.get_window(start_ms=0, end_ms=10) == 1
+
+ with store.start_partition_transaction(0) as tx:
+ state = tx.as_state(prefix=prefix)
+ assert state.get_window(start_ms=0, end_ms=10) == 1
+
+ def test_expire_windows(self, windowed_rocksdb_store_factory):
+ store = windowed_rocksdb_store_factory()
+ store.assign_partition(0)
+ prefix = b"__key__"
+ with store.start_partition_transaction(0) as tx:
+ state = tx.as_state(prefix=prefix)
+ state.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
+ state.update_window(start_ms=10, end_ms=20, value=2, timestamp_ms=10)
+
+ with store.start_partition_transaction(0) as tx:
+ state = tx.as_state(prefix=prefix)
+ state.update_window(start_ms=20, end_ms=30, value=3, timestamp_ms=20)
+ expired = state.expire_windows(duration_ms=10)
+ # "expire_windows" must update the expiration index so that the same
+ # windows are not expired twice
+ assert not state.expire_windows(duration_ms=10)
+
+ assert len(expired) == 2
+ assert expired == [
+ ((0, 10), 1),
+ ((10, 20), 2),
+ ]
+
+ with store.start_partition_transaction(0) as tx:
+ state = tx.as_state(prefix=prefix)
+ assert state.get_window(start_ms=0, end_ms=10) is None
+ assert state.get_window(start_ms=10, end_ms=20) is None
+ assert state.get_window(start_ms=20, end_ms=30) == 3
+
+ def test_get_latest_timestamp(self, windowed_rocksdb_store_factory):
+ store = windowed_rocksdb_store_factory()
+ partition = store.assign_partition(0)
+ timestamp = 123
+ prefix = b"__key__"
+ with partition.begin() as tx:
+ state = tx.as_state(prefix)
+ state.update_window(0, 10, value=1, timestamp_ms=timestamp)
+ store.revoke_partition(0)
+
+ partition = store.assign_partition(0)
+ with partition.begin() as tx:
+ assert tx.get_latest_timestamp() == timestamp
diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py
index 2791c4c2e..20299a116 100644
--- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py
+++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py
@@ -1,66 +1,68 @@
-from unittest.mock import call
-
import pytest
from quixstreams.state.rocksdb.metadata import (
CHANGELOG_CF_MESSAGE_HEADER,
- PREFIX_SEPARATOR,
-)
-from quixstreams.state.rocksdb.windowed.metadata import (
- LATEST_EXPIRED_WINDOW_CF_NAME,
- LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY,
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER,
)
from quixstreams.state.rocksdb.windowed.serialization import encode_window_key
+from quixstreams.utils.json import dumps
class TestWindowedRocksDBPartitionTransaction:
def test_update_window(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
- assert tx.get_window(start_ms=0, end_ms=10) == 1
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix
+ )
+ assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) == 1
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- assert tx.get_window(start_ms=0, end_ms=10) == 1
+ assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) == 1
def test_get_window_doesnt_exist(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- assert tx.get_window(start_ms=0, end_ms=10) is None
+ assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None
def test_delete_window(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=1)
- assert tx.get_window(start_ms=0, end_ms=10) == 1
- tx.delete_window(start_ms=0, end_ms=10)
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=1, prefix=prefix
+ )
+ assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) == 1
+ tx.delete_window(start_ms=0, end_ms=10, prefix=prefix)
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- assert tx.get_window(start_ms=0, end_ms=10) is None
+ assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None
def test_expire_windows_expired(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
- tx.update_window(start_ms=10, end_ms=20, value=2, timestamp_ms=10)
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix
+ )
+ tx.update_window(
+ start_ms=10, end_ms=20, value=2, timestamp_ms=10, prefix=prefix
+ )
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=20, end_ms=30, value=3, timestamp_ms=20)
- expired = tx.expire_windows(duration_ms=10)
- # "expire_windows" must update the expiration index so that the same
- # windows are not expired twice
- assert not tx.expire_windows(duration_ms=10)
+ tx.update_window(
+ start_ms=20, end_ms=30, value=3, timestamp_ms=20, prefix=prefix
+ )
+ expired = tx.expire_windows(duration_ms=10, prefix=prefix)
+ # "expire_windows" must update the expiration index so that the same
+ # windows are not expired twice
+ assert not tx.expire_windows(duration_ms=10, prefix=prefix)
assert len(expired) == 2
assert expired == [
@@ -69,10 +71,9 @@ def test_expire_windows_expired(self, windowed_rocksdb_store_factory):
]
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- assert tx.get_window(start_ms=0, end_ms=10) is None
- assert tx.get_window(start_ms=10, end_ms=20) is None
- assert tx.get_window(start_ms=20, end_ms=30) == 3
+ assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None
+ assert tx.get_window(start_ms=10, end_ms=20, prefix=prefix) is None
+ assert tx.get_window(start_ms=20, end_ms=30, prefix=prefix) == 3
def test_expire_windows_cached(self, windowed_rocksdb_store_factory):
"""
@@ -81,52 +82,62 @@ def test_expire_windows_cached(self, windowed_rocksdb_store_factory):
"""
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
- tx.update_window(start_ms=10, end_ms=20, value=2, timestamp_ms=10)
- tx.update_window(start_ms=20, end_ms=30, value=3, timestamp_ms=20)
- expired = tx.expire_windows(duration_ms=10)
- # "expire_windows" must update the expiration index so that the same
- # windows are not expired twice
- assert not tx.expire_windows(duration_ms=10)
-
- assert len(expired) == 2
- assert expired == [
- ((0, 10), 1),
- ((10, 20), 2),
- ]
-
- with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- assert tx.get_window(start_ms=0, end_ms=10) is None
- assert tx.get_window(start_ms=10, end_ms=20) is None
- assert tx.get_window(start_ms=20, end_ms=30) == 3
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix
+ )
+ tx.update_window(
+ start_ms=10, end_ms=20, value=2, timestamp_ms=10, prefix=prefix
+ )
+ tx.update_window(
+ start_ms=20, end_ms=30, value=3, timestamp_ms=20, prefix=prefix
+ )
+ expired = tx.expire_windows(duration_ms=10, prefix=prefix)
+ # "expire_windows" must update the expiration index so that the same
+ # windows are not expired twice
+ assert not tx.expire_windows(duration_ms=10, prefix=prefix)
+ assert len(expired) == 2
+ assert expired == [
+ ((0, 10), 1),
+ ((10, 20), 2),
+ ]
+ assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None
+ assert tx.get_window(start_ms=10, end_ms=20, prefix=prefix) is None
+ assert tx.get_window(start_ms=20, end_ms=30, prefix=prefix) == 3
def test_expire_windows_empty(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix
+ )
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix
+ )
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=3, end_ms=13, value=1, timestamp_ms=3)
- assert not tx.expire_windows(duration_ms=10)
+ tx.update_window(
+ start_ms=3, end_ms=13, value=1, timestamp_ms=3, prefix=prefix
+ )
+ assert not tx.expire_windows(duration_ms=10, prefix=prefix)
def test_expire_windows_with_grace_expired(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix
+ )
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=15, end_ms=25, value=1, timestamp_ms=15)
- expired = tx.expire_windows(duration_ms=10, grace_ms=5)
+ tx.update_window(
+ start_ms=15, end_ms=25, value=1, timestamp_ms=15, prefix=prefix
+ )
+ expired = tx.expire_windows(duration_ms=10, grace_ms=5, prefix=prefix)
assert len(expired) == 1
assert expired == [((0, 10), 1)]
@@ -134,14 +145,17 @@ def test_expire_windows_with_grace_expired(self, windowed_rocksdb_store_factory)
def test_expire_windows_with_grace_empty(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix
+ )
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=13, end_ms=23, value=1, timestamp_ms=13)
- expired = tx.expire_windows(duration_ms=10, grace_ms=5)
+ tx.update_window(
+ start_ms=13, end_ms=23, value=1, timestamp_ms=13, prefix=prefix
+ )
+ expired = tx.expire_windows(duration_ms=10, grace_ms=5, prefix=prefix)
assert not expired
@@ -157,9 +171,10 @@ def test_get_window_invalid_duration(
):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
with pytest.raises(ValueError, match="Invalid window duration"):
- tx.get_window(start_ms=start_ms, end_ms=end_ms)
+ tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix)
@pytest.mark.parametrize(
"start_ms, end_ms",
@@ -173,10 +188,15 @@ def test_update_window_invalid_duration(
):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
with pytest.raises(ValueError, match="Invalid window duration"):
tx.update_window(
- start_ms=start_ms, end_ms=end_ms, value=1, timestamp_ms=1
+ start_ms=start_ms,
+ end_ms=end_ms,
+ value=1,
+ timestamp_ms=1,
+ prefix=prefix,
)
@pytest.mark.parametrize(
@@ -191,39 +211,50 @@ def test_delete_window_invalid_duration(
):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
with pytest.raises(ValueError, match="Invalid window duration"):
- tx.delete_window(start_ms=start_ms, end_ms=end_ms)
+ tx.delete_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix)
def test_expire_windows_no_expired(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix
+ )
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=1, end_ms=11, value=1, timestamp_ms=9)
- # "expire_windows" must update the expiration index so that the same
- # windows are not expired twice
- assert not tx.expire_windows(duration_ms=10)
+ tx.update_window(
+ start_ms=1, end_ms=11, value=1, timestamp_ms=9, prefix=prefix
+ )
+ # "expire_windows" must update the expiration index so that the same
+ # windows are not expired twice
+ assert not tx.expire_windows(duration_ms=10, prefix=prefix)
def test_expire_windows_multiple_windows(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
store.assign_partition(0)
+ prefix = b"__key__"
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
- tx.update_window(start_ms=10, end_ms=20, value=1, timestamp_ms=11)
- tx.update_window(start_ms=20, end_ms=30, value=1, timestamp_ms=21)
+ tx.update_window(
+ start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix
+ )
+ tx.update_window(
+ start_ms=10, end_ms=20, value=1, timestamp_ms=11, prefix=prefix
+ )
+ tx.update_window(
+ start_ms=20, end_ms=30, value=1, timestamp_ms=21, prefix=prefix
+ )
with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- tx.update_window(start_ms=30, end_ms=40, value=1, timestamp_ms=31)
- # "expire_windows" must update the expiration index so that the same
- # windows are not expired twice
- expired = tx.expire_windows(duration_ms=10)
+ tx.update_window(
+ start_ms=30, end_ms=40, value=1, timestamp_ms=31, prefix=prefix
+ )
+ # "expire_windows" must update the expiration index so that the same
+ # windows are not expired twice
+ expired = tx.expire_windows(duration_ms=10, prefix=prefix)
assert len(expired) == 3
assert expired[0] == ((0, 10), 1)
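The comments in the two tests above stress that `expire_windows` must advance an expiration index so the same windows are never expired twice. A minimal sketch of that behaviour, with names and structure that are purely illustrative rather than the store's internals:

```python
# Minimal sketch of the "expiration index" behaviour described in the comments
# above: after windows are expired, the latest expired start timestamp is
# remembered per key prefix, so a second call over the same data expires
# nothing. Names and structure are illustrative only.
from typing import Dict, List, Tuple

Window = Tuple[int, int]


class ExpirationIndex:
    def __init__(self) -> None:
        self._latest_expired_start: Dict[bytes, int] = {}

    def expire(
        self,
        windows: Dict[Window, int],
        latest_ts: int,
        duration_ms: int,
        prefix: bytes,
    ) -> List[Tuple[Window, int]]:
        floor = self._latest_expired_start.get(prefix, -1)
        expired = [
            ((start, end), value)
            for (start, end), value in sorted(windows.items())
            if start <= latest_ts - duration_ms and start > floor
        ]
        if expired:
            self._latest_expired_start[prefix] = expired[-1][0][0]
        return expired


index = ExpirationIndex()
state = {(0, 10): 1, (10, 20): 1, (20, 30): 1, (30, 40): 1}
first = index.expire(state, latest_ts=31, duration_ms=10, prefix=b"__key__")
assert [w for w, _ in first] == [(0, 10), (10, 20), (20, 30)]
# Same data and timestamp again: the index prevents double expiration.
assert index.expire(state, latest_ts=31, duration_ms=10, prefix=b"__key__") == []
```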
@@ -248,8 +279,9 @@ def test_get_latest_timestamp_update(self, windowed_rocksdb_store_factory):
store = windowed_rocksdb_store_factory()
partition = store.assign_partition(0)
timestamp = 123
+ prefix = b"__key__"
with partition.begin() as tx:
- tx.update_window(0, 10, value=1, timestamp_ms=timestamp)
+ tx.update_window(0, 10, value=1, timestamp_ms=timestamp, prefix=prefix)
with partition.begin() as tx:
assert tx.get_latest_timestamp() == timestamp
@@ -258,8 +290,9 @@ def test_get_latest_timestamp_loaded_from_db(self, windowed_rocksdb_store_factor
store = windowed_rocksdb_store_factory()
partition = store.assign_partition(0)
timestamp = 123
+ prefix = b"__key__"
with partition.begin() as tx:
- tx.update_window(0, 10, value=1, timestamp_ms=timestamp)
+ tx.update_window(0, 10, value=1, timestamp_ms=timestamp, prefix=prefix)
store.revoke_partition(0)
partition = store.assign_partition(0)
@@ -272,267 +305,78 @@ def test_get_latest_timestamp_cannot_go_backwards(
store = windowed_rocksdb_store_factory()
partition = store.assign_partition(0)
timestamp = 9
+ prefix = b"__key__"
with partition.begin() as tx:
- tx.update_window(0, 10, value=1, timestamp_ms=timestamp)
- tx.update_window(0, 10, value=1, timestamp_ms=timestamp - 1)
+ tx.update_window(0, 10, value=1, timestamp_ms=timestamp, prefix=prefix)
+ tx.update_window(0, 10, value=1, timestamp_ms=timestamp - 1, prefix=prefix)
assert tx.get_latest_timestamp() == timestamp
with partition.begin() as tx:
assert tx.get_latest_timestamp() == timestamp
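The three `get_latest_timestamp` tests assert that the value is updated by `update_window`, survives revoking and reassigning the partition, and never moves backwards when an older timestamp is written. A tiny illustration of that monotonic tracking, as a model of the asserted behaviour rather than the store's code:

```python
# Illustration of the monotonic "latest timestamp" behaviour the tests above
# assert: observing an older timestamp never moves the stored value backwards.
class LatestTimestampTracker:
    def __init__(self, initial_ms: int = 0) -> None:
        self._latest_ms = initial_ms

    def observe(self, timestamp_ms: int) -> None:
        # Older timestamps are ignored, so the value cannot go backwards.
        self._latest_ms = max(self._latest_ms, timestamp_ms)

    @property
    def latest_ms(self) -> int:
        return self._latest_ms


tracker = LatestTimestampTracker()
tracker.observe(9)
tracker.observe(8)  # mirrors the timestamp - 1 update in the test above
assert tracker.latest_ms == 9
```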
-
-class TestWindowedRocksDBPartitionTransactionChangelog:
- def test_update_window(self, windowed_rocksdb_store_factory_changelog):
- store = windowed_rocksdb_store_factory_changelog()
- partition_num = 0
- store_partition = store.assign_partition(partition_num)
- producer = store_partition._changelog_producer._producer
- key = b"__key__"
+ def test_update_window_and_prepare(
+ self, windowed_rocksdb_partition_factory, changelog_producer_mock
+ ):
+ prefix = b"__key__"
start_ms = 0
end_ms = 10
value = 1
+ processed_offset = 1
+
+ with windowed_rocksdb_partition_factory(
+ changelog_producer=changelog_producer_mock
+ ) as store_partition:
+ tx = store_partition.begin()
+ tx.update_window(
+ start_ms=start_ms,
+ end_ms=end_ms,
+ value=value,
+ timestamp_ms=2,
+ prefix=prefix,
+ )
+ tx.prepare(processed_offset=processed_offset)
+ assert tx.prepared
- with store.start_partition_transaction(partition_num) as tx:
- with tx.with_prefix(key):
- expected_produced_key = tx._serialize_key(
- encode_window_key(start_ms, end_ms)
- )
- expected_produced_value = tx._serialize_value(value)
- tx.update_window(
- start_ms=start_ms, end_ms=end_ms, value=value, timestamp_ms=2
- )
- assert tx.get_window(start_ms=start_ms, end_ms=end_ms) == value
-
- with store.start_partition_transaction(partition_num) as tx:
- with tx.with_prefix(key):
- assert tx.get_window(start_ms=start_ms, end_ms=end_ms) == value
-
- assert (
- store_partition.get_changelog_offset() == producer.produce.call_count == 1
+ assert changelog_producer_mock.produce.call_count == 1
+ expected_produced_key = tx._serialize_key(
+ encode_window_key(start_ms, end_ms), prefix=prefix
)
- producer.produce.assert_called_with(
+ expected_produced_value = tx._serialize_value(value)
+ changelog_producer_mock.produce.assert_called_with(
key=expected_produced_key,
value=expected_produced_value,
- headers={CHANGELOG_CF_MESSAGE_HEADER: "default"},
- topic=store_partition._changelog_producer._changelog_name,
- partition=store_partition._changelog_producer._partition_num,
+ headers={
+ CHANGELOG_CF_MESSAGE_HEADER: "default",
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset),
+ },
)
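The new `test_update_window_and_prepare` asserts that after `prepare(processed_offset=...)` the changelog record carries two headers: the column family name and the processed source offset passed through `dumps`. A sketch of how such headers might be assembled and read back; the header constant values and the use of plain JSON for the offset are assumptions here, since the diff only shows the constants by name:

```python
# Sketch of the changelog-header layout the assertion above checks: the column
# family name plus the processed source offset. JSON serialization and the
# header constant values are assumptions for illustration.
import json

CHANGELOG_CF_MESSAGE_HEADER = "__column_family__"                     # illustrative value
CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER = "__processed_tp_offset__"  # illustrative value


def build_changelog_headers(cf_name: str, processed_offset: int) -> dict:
    return {
        CHANGELOG_CF_MESSAGE_HEADER: cf_name,
        CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: json.dumps(processed_offset),
    }


headers = build_changelog_headers("default", processed_offset=1)
# A recovery path could read the offset back to know how far the source topic
# had been processed when this changelog record was written.
assert json.loads(headers[CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER]) == 1
```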
- def test_delete_window(self, windowed_rocksdb_store_factory_changelog):
- store = windowed_rocksdb_store_factory_changelog()
- partition_num = 0
- store_partition = store.assign_partition(partition_num)
- producer = store_partition._changelog_producer._producer
- key = b"__key__"
- expected_produced_value = None
+ def test_delete_window_and_prepare(
+ self, windowed_rocksdb_partition_factory, changelog_producer_mock
+ ):
+ prefix = b"__key__"
start_ms = 0
end_ms = 10
+ processed_offset = 1
- with store.start_partition_transaction(partition_num) as tx:
- with tx.with_prefix(key):
- expected_produced_key = tx._serialize_key(
- encode_window_key(start_ms, end_ms)
- )
- tx.update_window(
- start_ms=start_ms, end_ms=end_ms, value=1, timestamp_ms=1
- )
- assert tx.get_window(start_ms=start_ms, end_ms=end_ms) == 1
- tx.delete_window(start_ms=start_ms, end_ms=end_ms)
-
- with store.start_partition_transaction(partition_num) as tx:
- with tx.with_prefix(key):
- assert (
- tx.get_window(start_ms=start_ms, end_ms=end_ms)
- is expected_produced_value
- )
+ with windowed_rocksdb_partition_factory(
+ changelog_producer=changelog_producer_mock
+ ) as store_partition:
- assert (
- store_partition.get_changelog_offset() == producer.produce.call_count == 1
- )
- producer.produce.assert_called_with(
- key=expected_produced_key,
- value=expected_produced_value,
- headers={CHANGELOG_CF_MESSAGE_HEADER: "default"},
- topic=store_partition._changelog_producer._changelog_name,
- partition=store_partition._changelog_producer._partition_num,
- )
-
- def test_expire_windows_expired(self, windowed_rocksdb_store_factory_changelog):
- store = windowed_rocksdb_store_factory_changelog()
- partition_num = 0
- store_partition = store.assign_partition(partition_num)
- producer = store_partition._changelog_producer._producer
- key = b"__key__"
- expected_update_produce_keys = []
- expected_update_produce_values = []
- expected_expired_window_keys = []
- expected_expired_windows = [
- dict(start_ms=0, end_ms=10, value=1, timestamp_ms=2),
- dict(start_ms=10, end_ms=20, value=2, timestamp_ms=10),
- ]
-
- # update windows, which will become expired later
- with store.start_partition_transaction(partition_num) as tx:
- with tx.with_prefix(key):
- for kwargs in expected_expired_windows:
- serialized_key = tx._serialize_key(
- encode_window_key(
- start_ms=kwargs["start_ms"], end_ms=kwargs["end_ms"]
- )
- )
- expected_update_produce_keys.append(serialized_key)
- expected_expired_window_keys.append(serialized_key)
- expected_update_produce_values.append(
- tx._serialize_value(kwargs["value"])
- )
- tx.update_window(**kwargs)
-
- # add new window update, which expires previous windows
- with store.start_partition_transaction(partition_num) as tx:
- with tx.with_prefix(key):
- kwargs = dict(start_ms=20, end_ms=30, value=3, timestamp_ms=20)
- expected_update_produce_keys.append(
- tx._serialize_key(
- encode_window_key(
- start_ms=kwargs["start_ms"], end_ms=kwargs["end_ms"]
- )
- )
- )
- expected_update_produce_values.append(
- tx._serialize_value(kwargs["value"])
- )
- tx.update_window(**kwargs)
- expired = tx.expire_windows(duration_ms=10)
- print(expired)
- # "expire_windows" must update the expiration index so that the same
- # windows are not expired twice
- assert not tx.expire_windows(duration_ms=10)
-
- assert expired == [
- ((w["start_ms"], w["end_ms"]), w["value"]) for w in expected_expired_windows
- ]
+ tx = store_partition.begin()
+ tx.delete_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix)
+ tx.prepare(processed_offset=processed_offset)
+ assert tx.prepared
- produce_calls = [
- call(
- key=k,
- value=v,
- headers={CHANGELOG_CF_MESSAGE_HEADER: "default"},
- topic=store_partition._changelog_producer._changelog_name,
- partition=store_partition._changelog_producer._partition_num,
- )
- for k, v in zip(
- expected_update_produce_keys, expected_update_produce_values
- )
- ]
-
- produce_calls.extend(
- [
- call(
- key=k,
- value=None,
- headers={CHANGELOG_CF_MESSAGE_HEADER: "default"},
- topic=store_partition._changelog_producer._changelog_name,
- partition=store_partition._changelog_producer._partition_num,
- )
- for k in expected_expired_window_keys
- ]
+ assert changelog_producer_mock.produce.call_count == 1
+ expected_produced_key = tx._serialize_key(
+ encode_window_key(start_ms, end_ms), prefix=prefix
)
-
- produce_calls.append(
- call(
- key=key + PREFIX_SEPARATOR + LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY,
- value=str(expected_expired_windows[-1]["start_ms"]).encode(),
- headers={CHANGELOG_CF_MESSAGE_HEADER: LATEST_EXPIRED_WINDOW_CF_NAME},
- topic=store_partition._changelog_producer._changelog_name,
- partition=store_partition._changelog_producer._partition_num,
- )
- )
-
- producer.produce.assert_has_calls(produce_calls)
- assert producer.produce.call_count == len(produce_calls)
-
- with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- assert tx.get_window(start_ms=0, end_ms=10) is None
- assert tx.get_window(start_ms=10, end_ms=20) is None
- assert tx.get_window(start_ms=20, end_ms=30) == 3
-
- def test_expire_windows_cached(self, windowed_rocksdb_store_factory_changelog):
- """
- Check that windows expire correctly even if they're not committed to the DB
- yet.
-
- Consequently, only the end result of a window should be produced to the
- changelog topic, not every update.
- """
- store = windowed_rocksdb_store_factory_changelog()
- partition_num = 0
- store_partition = store.assign_partition(partition_num)
- producer = store_partition._changelog_producer._producer
- key = b"__key__"
- expected_update_produce_keys = []
- expected_update_produce_values = []
- update_windows = [
- dict(start_ms=0, end_ms=10, value=1, timestamp_ms=2),
- dict(start_ms=10, end_ms=20, value=2, timestamp_ms=10),
- dict(start_ms=20, end_ms=30, value=3, timestamp_ms=20),
- ]
- expected_expired_windows = update_windows[:2]
-
- with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- for kwargs in update_windows:
- serialized_key = tx._serialize_key(
- encode_window_key(
- start_ms=kwargs["start_ms"], end_ms=kwargs["end_ms"]
- )
- )
- tx.update_window(**kwargs)
- expected_update_produce_keys.append(serialized_key)
- if kwargs in expected_expired_windows:
- expected_update_produce_values.append(None)
- else:
- expected_update_produce_values.append(
- tx._serialize_value(kwargs["value"])
- )
-
- expired = tx.expire_windows(duration_ms=10)
- # "expire_windows" must update the expiration index so that the same
- # windows are not expired twice
- assert not tx.expire_windows(duration_ms=10)
-
- assert expired == [
- ((w["start_ms"], w["end_ms"]), w["value"]) for w in expected_expired_windows
- ]
-
- produce_calls = [
- call(
- key=k,
- value=v,
- headers={CHANGELOG_CF_MESSAGE_HEADER: "default"},
- topic=store_partition._changelog_producer._changelog_name,
- partition=store_partition._changelog_producer._partition_num,
- )
- for k, v in zip(
- expected_update_produce_keys, expected_update_produce_values
- )
- ]
-
- produce_calls.append(
- call(
- key=key + PREFIX_SEPARATOR + LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY,
- value=str(expected_expired_windows[-1]["start_ms"]).encode(),
- headers={CHANGELOG_CF_MESSAGE_HEADER: LATEST_EXPIRED_WINDOW_CF_NAME},
- topic=store_partition._changelog_producer._changelog_name,
- partition=store_partition._changelog_producer._partition_num,
- )
+ changelog_producer_mock.produce.assert_called_with(
+ key=expected_produced_key,
+ value=None,
+ headers={
+ CHANGELOG_CF_MESSAGE_HEADER: "default",
+ CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset),
+ },
)
-
- producer.produce.assert_has_calls(produce_calls)
- assert producer.produce.call_count == len(produce_calls)
-
- with store.start_partition_transaction(0) as tx:
- with tx.with_prefix(b"__key__"):
- assert tx.get_window(start_ms=0, end_ms=10) is None
- assert tx.get_window(start_ms=10, end_ms=20) is None
- assert tx.get_window(start_ms=20, end_ms=30) == 3
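Taken together, the test changes in this file track one consistent API shift: the `with_prefix` context manager is replaced by an explicit `prefix=` argument on every windowed operation, and changelog production is asserted against an explicitly prepared transaction instead of a wrapped producer. A before/after sketch of the calling convention as it appears in these tests; the stub below only mimics the signatures and is not the real transaction class:

```python
# Before/after of the calling convention exercised by these tests. The stub
# only records calls so the new style can be demonstrated end to end.
class _WindowedTxStub:
    def __init__(self) -> None:
        self.calls = []

    def update_window(self, start_ms, end_ms, value, timestamp_ms, prefix):
        self.calls.append(("update", (start_ms, end_ms), value, prefix))

    def expire_windows(self, duration_ms, prefix, grace_ms=0):
        self.calls.append(("expire", duration_ms, grace_ms, prefix))
        return []


tx = _WindowedTxStub()

# Old style (removed in this diff):
#     with tx.with_prefix(b"__key__"):
#         tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2)
#         tx.expire_windows(duration_ms=10)
#
# New style: the key prefix travels with every call instead of a context manager.
prefix = b"__key__"
tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix)
tx.expire_windows(duration_ms=10, prefix=prefix)
assert all(call[-1] == prefix for call in tx.calls)
```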
diff --git a/tests/test_quixstreams/utils.py b/tests/test_quixstreams/utils.py
deleted file mode 100644
index 645106a5a..000000000
--- a/tests/test_quixstreams/utils.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from typing import Optional, List, Tuple, Union
-
-
-class ConfluentKafkaMessageStub:
- """
- A stub object to mock `confluent_kafka.Message`.
-
- Instances of `confluent_kafka.Message` cannot be directly created from Python,
- see https://github.com/confluentinc/confluent-kafka-python/issues/1535.
-
- """
-
- def __init__(
- self,
- topic: str = "test",
- partition: int = 0,
- offset: int = 0,
- timestamp: Tuple[int, int] = (1, 123),
- key: bytes = None,
- value: bytes = None,
- headers: Optional[List[Tuple[str, bytes]]] = None,
- latency: float = None,
- leader_epoch: int = None,
- ):
- self._topic = topic
- self._partition = partition
- self._offset = offset
- self._timestamp = timestamp
- self._key = key
- self._value = value
- self._headers = headers
- self._latency = latency
- self._leader_epoch = leader_epoch
-
- def headers(self, *args, **kwargs) -> Optional[List[Tuple[str, bytes]]]:
- return self._headers
-
- def key(self, *args, **kwargs) -> Optional[Union[str, bytes]]:
- return self._key
-
- def offset(self, *args, **kwargs) -> int:
- return self._offset
-
- def partition(self, *args, **kwargs) -> int:
- return self._partition
-
- def timestamp(self, *args, **kwargs) -> (int, int):
- return self._timestamp
-
- def topic(self, *args, **kwargs) -> str:
- return self._topic
-
- def value(self, *args, **kwargs) -> Optional[Union[str, bytes]]:
- return self._value
-
- def latency(self, *args, **kwargs) -> Optional[float]:
- return self._latency
-
- def leader_epoch(self, *args, **kwargs) -> Optional[int]:
- return self._leader_epoch
-
- def __len__(self) -> int:
- return len(self._value)
diff --git a/tests/utils.py b/tests/utils.py
index a1ceeb476..786dcb9ba 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,6 +1,6 @@
import dataclasses
import time
-
+from typing import Optional, List, Tuple, Union
from confluent_kafka import OFFSET_INVALID
DEFAULT_TIMEOUT = 10.0
@@ -31,3 +31,65 @@ class TopicPartitionStub:
topic: str
partition: int
offset: int = OFFSET_INVALID
+
+
+class ConfluentKafkaMessageStub:
+ """
+ A stub object to mock `confluent_kafka.Message`.
+
+ Instances of `confluent_kafka.Message` cannot be directly created from Python,
+ see https://github.com/confluentinc/confluent-kafka-python/issues/1535.
+
+ """
+
+ def __init__(
+ self,
+ topic: str = "test",
+ partition: int = 0,
+ offset: int = 0,
+ timestamp: Tuple[int, int] = (1, 123),
+        key: Optional[bytes] = None,
+        value: Optional[bytes] = None,
+        headers: Optional[List[Tuple[str, bytes]]] = None,
+        latency: Optional[float] = None,
+        leader_epoch: Optional[int] = None,
+ ):
+ self._topic = topic
+ self._partition = partition
+ self._offset = offset
+ self._timestamp = timestamp
+ self._key = key
+ self._value = value
+ self._headers = headers
+ self._latency = latency
+ self._leader_epoch = leader_epoch
+
+ def headers(self, *args, **kwargs) -> Optional[List[Tuple[str, bytes]]]:
+ return self._headers
+
+ def key(self, *args, **kwargs) -> Optional[Union[str, bytes]]:
+ return self._key
+
+ def offset(self, *args, **kwargs) -> int:
+ return self._offset
+
+ def partition(self, *args, **kwargs) -> int:
+ return self._partition
+
+    def timestamp(self, *args, **kwargs) -> Tuple[int, int]:
+ return self._timestamp
+
+ def topic(self, *args, **kwargs) -> str:
+ return self._topic
+
+ def value(self, *args, **kwargs) -> Optional[Union[str, bytes]]:
+ return self._value
+
+ def latency(self, *args, **kwargs) -> Optional[float]:
+ return self._latency
+
+ def leader_epoch(self, *args, **kwargs) -> Optional[int]:
+ return self._leader_epoch
+
+ def __len__(self) -> int:
+ return len(self._value)
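The relocated stub keeps the same read-only surface as `confluent_kafka.Message`, so tests can hand it to any code that only calls the accessor methods. A hypothetical usage sketch, assuming the `tests` directory is importable as a package; the function under test is invented for illustration only:

```python
# Hypothetical example of using the relocated stub in a test; describe_message
# is invented here and is not part of the library.
from tests.utils import ConfluentKafkaMessageStub


def describe_message(msg) -> str:
    # Works with anything exposing the confluent_kafka.Message accessors.
    return f"{msg.topic()}[{msg.partition()}]@{msg.offset()}: {len(msg)} bytes"


def test_describe_message():
    msg = ConfluentKafkaMessageStub(
        topic="input", partition=3, offset=42, key=b"k", value=b"payload"
    )
    assert describe_message(msg) == "input[3]@42: 7 bytes"
```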