From f469cca5710ab693fd229f84c767d54fd00ab286 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 2 Apr 2024 21:19:54 +0200 Subject: [PATCH 01/28] Batch state updates in RocksDBPartitionTransaction Do not write updates to Writebatch but do it once in the end instead --- quixstreams/state/rocksdb/exceptions.py | 4 - quixstreams/state/rocksdb/partition.py | 5 +- quixstreams/state/rocksdb/transaction.py | 73 ++++++++++++------- .../state/rocksdb/windowed/transaction.py | 21 +++--- .../test_state/test_manager.py | 10 ++- .../test_rocksdb/test_transaction.py | 18 ++--- .../test_windowed/test_transaction.py | 14 ++-- 7 files changed, 79 insertions(+), 66 deletions(-) diff --git a/quixstreams/state/rocksdb/exceptions.py b/quixstreams/state/rocksdb/exceptions.py index 502a96336..063fa49fc 100644 --- a/quixstreams/state/rocksdb/exceptions.py +++ b/quixstreams/state/rocksdb/exceptions.py @@ -3,7 +3,6 @@ __all__ = ( "StateSerializationError", "StateTransactionError", - "NestedPrefixError", "ColumnFamilyDoesNotExist", "ColumnFamilyAlreadyExists", "ColumnFamilyHeaderMissing", @@ -19,9 +18,6 @@ class StateSerializationError(StateError): ... class StateTransactionError(StateError): ... -class NestedPrefixError(StateError): ... - - class ColumnFamilyDoesNotExist(StateError): ... diff --git a/quixstreams/state/rocksdb/partition.py b/quixstreams/state/rocksdb/partition.py index 800a48a1f..cf78cb45b 100644 --- a/quixstreams/state/rocksdb/partition.py +++ b/quixstreams/state/rocksdb/partition.py @@ -2,7 +2,7 @@ import time from typing import Any, Union, Optional, List, Dict -from rocksdict import WriteBatch, Rdict, ColumnFamily, AccessType +from rocksdict import WriteBatch, Rdict, ColumnFamily, AccessType, WriteOptions from quixstreams.models import ConfluentKafkaMessageProto from quixstreams.state.recovery import ChangelogProducer @@ -312,6 +312,9 @@ def _open_rocksdict(self) -> Rdict: options=options, access_type=AccessType.read_write(), ) + # write_opts = WriteOptions() + # write_opts.disable_wal = True + # rdict.set_write_options(write_opts) # Ensure metadata column family is created without defining it upfront try: rdict.get_column_family(METADATA_CF_NAME) diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py index 52207f012..f6cdb9853 100644 --- a/quixstreams/state/rocksdb/transaction.py +++ b/quixstreams/state/rocksdb/transaction.py @@ -12,7 +12,6 @@ PartitionTransaction, ) from .exceptions import ( - NestedPrefixError, StateTransactionError, ) from .metadata import ( @@ -110,6 +109,7 @@ class RocksDBPartitionTransaction(PartitionTransaction): "_loads", "_state", ) + _prefix: bytes def __init__( self, @@ -124,7 +124,9 @@ def __init__( :param loads: a function to deserialize data from bytes. """ self._partition = partition - self._update_cache: Dict[str, Dict[bytes, Union[bytes, Undefined]]] = {} + self._update_cache: Dict[ + str, Dict[bytes, Dict[bytes, Union[bytes, Undefined]]] + ] = {"default": {}} self._batch = WriteBatch(raw_mode=True) self._prefix = _DEFAULT_PREFIX self._failed = False @@ -155,8 +157,6 @@ def with_prefix(self, prefix: Any = b"") -> Self: automatically between the key and the prefix if the prefix is not empty. 
""" - if self._prefix != _DEFAULT_PREFIX: - raise NestedPrefixError("The transaction already has a prefix") self._prefix = ( prefix if isinstance(prefix, bytes) else self._serialize_value(prefix) ) @@ -188,7 +188,11 @@ def get( # First, check the update cache in case the value was previously written # Use _undefined sentinel as default because the actual value can be "None" key_serialized = self._serialize_key(key) - cached = self._update_cache.get(cf_name, {}).get(key_serialized, _undefined) + cached = ( + self._update_cache.get(cf_name, {}) + .get(self._prefix, {}) + .get(key_serialized, _undefined) + ) if cached is _deleted: return default @@ -213,13 +217,10 @@ def set(self, key: Any, value: Any, cf_name: str = "default"): :param cf_name: rocksdb column family name. Default - "default" """ - key_serialized = self._serialize_key(key) - value_serialized = self._serialize_value(value) - try: - cf_handle = self._partition.get_column_family_handle(cf_name) - self._batch.put(key_serialized, value_serialized, cf_handle) - self._update_cache.setdefault(cf_name, {})[ + key_serialized = self._serialize_key(key) + value_serialized = self._serialize_value(value) + self._update_cache.setdefault(cf_name, {}).setdefault(self._prefix, {})[ key_serialized ] = value_serialized except Exception: @@ -236,14 +237,11 @@ def delete(self, key: Any, cf_name: str = "default"): :param key: key to delete from DB :param cf_name: rocksdb column family name. Default - "default" """ - key_serialized = self._serialize_key(key) try: - cf_handle = self._partition.get_column_family_handle(cf_name) - self._batch.delete(key_serialized, cf_handle) - - if cf_name not in self._update_cache: - self._update_cache[cf_name] = {} - self._update_cache[cf_name][key_serialized] = _deleted + key_serialized = self._serialize_key(key) + self._update_cache.setdefault(cf_name, {}).setdefault(self._prefix, {})[ + key_serialized + ] = _deleted except Exception: self._failed = True @@ -262,7 +260,11 @@ def exists(self, key: Any, cf_name: str = "default") -> bool: """ key_serialized = self._serialize_key(key) - cached = self._update_cache.get(cf_name, {}).get(key_serialized, _undefined) + cached = ( + self._update_cache.get(cf_name, {}) + .get(self._prefix, {}) + .get(key_serialized, _undefined) + ) if cached is _deleted: return False @@ -301,13 +303,16 @@ def _update_changelog(self, meta_cf_handle: ColumnFamily): logger.debug("Flushing state changes to the changelog topic...") offset = self._partition.get_changelog_offset() or 0 - for cf_name in self._update_cache: + for cf_name, cf_update_cache in self._update_cache.items(): headers = {CHANGELOG_CF_MESSAGE_HEADER: cf_name} - for k, v in self._update_cache[cf_name].items(): - self._partition.produce_to_changelog( - key=k, value=v if v is not _deleted else None, headers=headers - ) - offset += 1 + for _prefix, prefix_update_cache in cf_update_cache.items(): + for key, value in prefix_update_cache.items(): + self._partition.produce_to_changelog( + key=key, + value=value if value is not _deleted else None, + headers=headers, + ) + offset += 1 self._batch.put( CHANGELOG_OFFSET_KEY, int_to_int64_bytes(offset), meta_cf_handle @@ -331,15 +336,27 @@ def maybe_flush(self, offset: Optional[int] = None): :param offset: offset of the last processed message, optional. 
""" try: + meta_cf_handle = self._partition.get_column_family_handle(METADATA_CF_NAME) + for cf_name, cf_update_cache in self._update_cache.items(): + cf_handle = self._partition.get_column_family_handle(cf_name) + for _prefix, prefix_update_cache in cf_update_cache.items(): + for key, value in prefix_update_cache.items(): + if value is _deleted: + self._batch.delete(key, cf_handle) + else: + self._batch.put(key, value, cf_handle) + + # TODO: Maybe unify writebatch and changelog work here so we do only one pass + # through the update cache + # Don't write batches if this transaction doesn't change any keys if len(self._batch): - cf_handle = self._partition.get_column_family_handle(METADATA_CF_NAME) if offset is not None: self._batch.put( - PROCESSED_OFFSET_KEY, int_to_int64_bytes(offset), cf_handle + PROCESSED_OFFSET_KEY, int_to_int64_bytes(offset), meta_cf_handle ) if self._partition.using_changelogs: - self._update_changelog(cf_handle) + self._update_changelog(meta_cf_handle) self._partition.write(self._batch) except Exception: self._failed = True diff --git a/quixstreams/state/rocksdb/windowed/transaction.py b/quixstreams/state/rocksdb/windowed/transaction.py index d54eb717f..9e674442f 100644 --- a/quixstreams/state/rocksdb/windowed/transaction.py +++ b/quixstreams/state/rocksdb/windowed/transaction.py @@ -10,7 +10,7 @@ LATEST_TIMESTAMP_KEY, PREFIX_SEPARATOR, ) -from ..partition import RocksDBPartitionTransaction +from ..transaction import RocksDBPartitionTransaction, _deleted from ..serialization import int_to_int64_bytes, serialize from ..types import LoadsFunc, DumpsFunc @@ -172,18 +172,17 @@ def _get_windows( read_opt=read_opt, from_key=seek_from_key ): message_key, start, end = parse_window_key(key) + if start_from_ms < start <= start_to_ms: + windows[(start, end)] = self._deserialize_value(value) - if message_key != self._prefix or start > start_to_ms: - break - elif start <= start_from_ms: - continue - - windows[(start, end)] = self._deserialize_value(value) - - for window_key, window_value in self._update_cache.get("default", {}).items(): + for window_key, window_value in ( + self._update_cache["default"].get(self._prefix, {}).items() + ): message_key, start, end = parse_window_key(window_key) - if message_key != self._prefix or not start_from_ms < start <= start_to_ms: + if window_value is _deleted: + windows.pop((start, end), None) continue - windows[(start, end)] = self._deserialize_value(window_value) + elif start_from_ms < start <= start_to_ms: + windows[(start, end)] = self._deserialize_value(window_value) return sorted(windows.items()) diff --git a/tests/test_quixstreams/test_state/test_manager.py b/tests/test_quixstreams/test_state/test_manager.py index 9942c6f1f..67982780d 100644 --- a/tests/test_quixstreams/test_state/test_manager.py +++ b/tests/test_quixstreams/test_state/test_manager.py @@ -13,6 +13,7 @@ WindowedStoreAlreadyRegisteredError, ) from quixstreams.state.recovery import ChangelogProducerFactory +from quixstreams.state.rocksdb import RocksDBPartitionTransaction from tests.utils import TopicPartitionStub @@ -229,7 +230,9 @@ def test_store_transaction_no_flush_if_partition_transaction_failed( tx_store2 = state_manager.get_store_transaction("store2") # Simulate exception in one of the transactions with contextlib.suppress(ValueError), patch.object( - rocksdict.WriteBatch, "put", side_effect=ValueError("test") + RocksDBPartitionTransaction, + "_serialize_key", + side_effect=ValueError("test"), ): tx_store1.set("some_key", "some_value") tx_store2.set("some_key", 
"some_value") @@ -344,7 +347,6 @@ def test_store_transaction_no_flush_on_exception( consumer.get_watermark_offsets.return_value = (0, 10) topic_manager.topic(name="topic") - # topic_admin_mock.inspect_topics.return_value = {"topic": None} state_manager.register_store("topic", store_name="store") state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) store = state_manager.get_store("topic", "store") @@ -388,7 +390,9 @@ def test_store_transaction_no_flush_if_partition_transaction_failed( tx_store2 = state_manager.get_store_transaction("store2") # Simulate exception in one of the transactions with contextlib.suppress(ValueError), patch.object( - rocksdict.WriteBatch, "put", side_effect=ValueError("test") + RocksDBPartitionTransaction, + "_serialize_key", + side_effect=ValueError("test"), ): tx_store1.set("some_key", "some_value") tx_store2.set("some_key", "some_value") diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py index d2a5da1ac..2a87cbae9 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py @@ -10,8 +10,8 @@ StateSerializationError, StateTransactionError, RocksDBStorePartition, - NestedPrefixError, RocksDBOptions, + RocksDBPartitionTransaction, ) from quixstreams.state.rocksdb.metadata import ( CHANGELOG_CF_MESSAGE_HEADER, @@ -322,9 +322,14 @@ def test_update_key_failed_transaction_failed(self, operation, rocksdb_partition Test that if the update operation (set or delete) fails the transaction is marked as failed and cannot be re-used anymore. """ + # TODO: Test fails because writebatch is not used anymore during updates + # TODO: What's the point of this "failing?" - To not flush anything if one of transactions is incomplete on __exit__ + # Since now each update translates with patch.object( - rocksdict.WriteBatch, "put", side_effect=ValueError("test") - ), patch.object(rocksdict.WriteBatch, "delete", side_effect=ValueError("test")): + RocksDBPartitionTransaction, + "_serialize_key", + side_effect=ValueError("test"), + ): with rocksdb_partition.begin() as tx: with contextlib.suppress(ValueError): operation(tx=tx) @@ -390,13 +395,6 @@ def test_transaction_not_flushed_on_error(self, rocksdb_partition): with rocksdb_partition.begin() as tx: assert tx.get("key") is None - def test_nested_prefixes_fail(self, rocksdb_partition): - tx = rocksdb_partition.begin() - with pytest.raises(NestedPrefixError): - with tx.with_prefix("prefix"): - with tx.with_prefix("prefix"): - ... 
- def test_custom_dumps_loads(self, rocksdb_partition_factory): key = secrets.token_bytes(10) value = secrets.token_bytes(10) diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py index 2791c4c2e..2cc87a62d 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py @@ -90,15 +90,11 @@ def test_expire_windows_cached(self, windowed_rocksdb_store_factory): # "expire_windows" must update the expiration index so that the same # windows are not expired twice assert not tx.expire_windows(duration_ms=10) - - assert len(expired) == 2 - assert expired == [ - ((0, 10), 1), - ((10, 20), 2), - ] - - with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): + assert len(expired) == 2 + assert expired == [ + ((0, 10), 1), + ((10, 20), 2), + ] assert tx.get_window(start_ms=0, end_ms=10) is None assert tx.get_window(start_ms=10, end_ms=20) is None assert tx.get_window(start_ms=20, end_ms=30) == 3 From e7aeb57c3a4226380368c3ea80caafd8b78d0ef1 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Thu, 11 Apr 2024 16:48:37 +0200 Subject: [PATCH 02/28] New Checkpointing flow (part 1) - Added new Checkpoint class to sync state updates and Kafka commits - Updated state transactions to span across multiple offsets - Added new ProcessingContext class to share dependencies and checkpoints between Application and SDF --- quixstreams/app.py | 219 ++++---- quixstreams/checkpoint.py | 141 +++++ quixstreams/dataframe/dataframe.py | 65 ++- quixstreams/dataframe/windows/time_based.py | 34 +- quixstreams/processing_context.py | 75 +++ quixstreams/state/manager.py | 133 +---- quixstreams/state/rocksdb/transaction.py | 118 ++--- quixstreams/state/rocksdb/windowed/state.py | 17 +- .../state/rocksdb/windowed/transaction.py | 64 ++- quixstreams/state/state.py | 16 +- quixstreams/state/types.py | 143 ++++-- tests/test_quixstreams/fixtures.py | 2 + tests/test_quixstreams/test_app.py | 141 +++-- tests/test_quixstreams/test_checkpoint.py | 166 ++++++ .../test_dataframe/fixtures.py | 19 +- .../test_dataframe/test_dataframe.py | 119 +---- .../test_windows/test_hopping.py | 108 ++-- .../test_windows/test_tumbling.py | 110 ++-- .../test_state/test_manager.py | 153 ------ .../test_state/test_rocksdb/fixtures.py | 37 +- .../test_state/test_rocksdb/test_partition.py | 3 +- .../test_state/test_rocksdb/test_store.py | 5 +- .../test_rocksdb/test_transaction.py | 483 ++++++++++-------- .../test_windowed/test_partition.py | 23 +- .../test_rocksdb/test_windowed/test_state.py | 56 ++ .../test_windowed/test_transaction.py | 373 ++++++++------ 26 files changed, 1534 insertions(+), 1289 deletions(-) create mode 100644 quixstreams/checkpoint.py create mode 100644 quixstreams/processing_context.py create mode 100644 tests/test_quixstreams/test_checkpoint.py create mode 100644 tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_state.py diff --git a/quixstreams/app.py b/quixstreams/app.py index 9f608dc98..53f4bfe26 100644 --- a/quixstreams/app.py +++ b/quixstreams/app.py @@ -9,6 +9,7 @@ from confluent_kafka import TopicPartition from typing_extensions import Self +from .checkpoint import Checkpoint from .context import set_message_context, copy_context from .core.stream import Filtered from .dataframe import StreamingDataFrame @@ -40,6 +41,7 @@ check_state_management_enabled, 
QuixTopicManager, ) +from .processing_context import ProcessingContext from .rowconsumer import RowConsumer from .rowproducer import RowProducer from .state import StateStoreManager @@ -96,7 +98,7 @@ def __init__( quix_sdk_token: Optional[str] = None, consumer_group: Optional[str] = None, auto_offset_reset: AutoOffsetReset = "latest", - auto_commit_enable: bool = True, + commit_interval: float = 5.0, partitioner: Partitioner = "murmur2", consumer_extra_config: Optional[dict] = None, producer_extra_config: Optional[dict] = None, @@ -131,8 +133,7 @@ def __init__( Default - "quixstreams-default" (set during init) >***NOTE:*** Quix Applications will prefix it with the Quix workspace id. :param auto_offset_reset: Consumer `auto.offset.reset` setting - :param auto_commit_enable: If true, periodically commit offset of - the last message handed to the application. Default - `True`. + :param partitioner: A function to be used to determine the outgoing message partition. :param consumer_extra_config: A dictionary with additional options that @@ -213,7 +214,6 @@ def __init__( ) quix_configs = quix_config_builder.get_confluent_broker_config() # Check if the state dir points to the mounted PVC while running on Quix - # TODO: Do we still need this? check_state_dir(state_dir=state_dir) broker_address = quix_configs.pop("bootstrap.servers") @@ -230,8 +230,8 @@ def __init__( self._broker_address = broker_address self._consumer_group = consumer_group self._auto_offset_reset = auto_offset_reset - self._auto_commit_enable = auto_commit_enable self._partitioner = partitioner + self._commit_interval = commit_interval self._producer_extra_config = producer_extra_config self._consumer_extra_config = consumer_extra_config @@ -239,7 +239,7 @@ def __init__( broker_address=broker_address, consumer_group=consumer_group, auto_offset_reset=auto_offset_reset, - auto_commit_enable=auto_commit_enable, + auto_commit_enable=False, # Disable auto commit and manage commits manually assignment_strategy="cooperative-sticky", extra_config=consumer_extra_config, on_error=on_consumer_error, @@ -267,21 +267,11 @@ def __init__( ) ) self._topic_manager = topic_manager - self._state_manager = StateStoreManager( group_id=consumer_group, state_dir=state_dir, rocksdb_options=rocksdb_options, - producer=( - RowProducer( - broker_address=broker_address, - partitioner=partitioner, - extra_config=producer_extra_config, - on_error=on_producer_error, - ) - if use_changelog_topics - else None - ), + producer=self._producer if use_changelog_topics else None, recovery_manager=( RecoveryManager( consumer=self._consumer, @@ -291,13 +281,24 @@ def __init__( else None ), ) + self._checkpoint = Checkpoint( + producer=self._producer, + consumer=self._consumer, + state_manager=self._state_manager, + commit_interval=self._commit_interval, + ) + self._processing_context = ProcessingContext( + commit_interval=self._commit_interval, + producer=self._producer, + consumer=self._consumer, + state_manager=self._state_manager, + ) @classmethod def Quix( cls, consumer_group: Optional[str] = None, auto_offset_reset: AutoOffsetReset = "latest", - auto_commit_enable: bool = True, partitioner: Partitioner = "murmur2", consumer_extra_config: Optional[dict] = None, producer_extra_config: Optional[dict] = None, @@ -356,8 +357,6 @@ def Quix( Default - "quixstreams-default" (set during init). >***NOTE:*** Quix Applications will prefix it with the Quix workspace id. 
:param auto_offset_reset: Consumer `auto.offset.reset` setting - :param auto_commit_enable: If true, periodically commit offset of - the last message handed to the application. Default - `True`. :param partitioner: A function to be used to determine the outgoing message partition. :param consumer_extra_config: A dictionary with additional options that @@ -415,7 +414,6 @@ def Quix( consumer_extra_config=consumer_extra_config, producer_extra_config=producer_extra_config, auto_offset_reset=auto_offset_reset, - auto_commit_enable=auto_commit_enable, partitioner=partitioner, on_consumer_error=on_consumer_error, on_processing_error=on_processing_error, @@ -545,8 +543,9 @@ def dataframe( to be used as an input topic. :return: `StreamingDataFrame` object """ - sdf = StreamingDataFrame(topic=topic, state_manager=self._state_manager) - sdf.producer = self._producer + sdf = StreamingDataFrame( + topic=topic, processing_context=self._processing_context + ) return sdf def stop(self): @@ -594,7 +593,7 @@ def get_producer(self) -> Producer: extra_config=self._producer_extra_config, ) - def get_consumer(self) -> Consumer: + def get_consumer(self, auto_commit_enable: bool = True) -> Consumer: """ Create and return a pre-configured Consumer instance. The Consumer is initialized with params passed to Application. @@ -633,7 +632,7 @@ def get_consumer(self) -> Consumer: broker_address=self._broker_address, consumer_group=self._consumer_group, auto_offset_reset=self._auto_offset_reset, - auto_commit_enable=self._auto_commit_enable, + auto_commit_enable=auto_commit_enable, assignment_strategy="cooperative-sticky", extra_config=self._consumer_extra_config, ) @@ -644,78 +643,6 @@ def clear_state(self): """ self._state_manager.clear_stores() - def _quix_runtime_init(self): - """ - Do a runtime setup only applicable to an Application.Quix instance - - Ensure that "State management" flag is enabled for deployment if the app - is stateful and is running in Quix Cloud - """ - # Ensure that state management is enabled if application is stateful - if self._state_manager.stores: - check_state_management_enabled() - - def _setup_topics(self): - topics_list = ", ".join( - f'"{topic.name}"' for topic in self._topic_manager.all_topics - ) - logger.info(f"Topics required for this application: {topics_list}") - if self._auto_create_topics: - self._topic_manager.create_all_topics() - self._topic_manager.validate_all_topics() - - def _process_message(self, dataframe_composed, start_state_transaction): - # Serve producer callbacks - self._producer.poll(self._producer_poll_timeout) - rows = self._consumer.poll_row(timeout=self._consumer_poll_timeout) - - if rows is None: - return - - # Deserializer may return multiple rows for a single message - rows = rows if isinstance(rows, list) else [rows] - if not rows: - return - - first_row = rows[0] - topic_name, partition, offset = ( - first_row.topic, - first_row.partition, - first_row.offset, - ) - - with start_state_transaction( - topic=topic_name, partition=partition, offset=offset - ): - for row in rows: - context = copy_context() - context.run(set_message_context, first_row.context) - try: - # Execute StreamingDataFrame in a context - context.run(dataframe_composed, row.value) - except Filtered: - # The message was filtered by StreamingDataFrame - continue - except Exception as exc: - # TODO: This callback might be triggered because of Producer - # errors too because they happen within ".process()" - to_suppress = self._on_processing_error(exc, row, logger) - if not to_suppress: - 
raise - - # Store the message offset after it's successfully processed - self._consumer.store_offsets( - offsets=[ - TopicPartition( - topic=topic_name, - partition=partition, - offset=offset + 1, - ) - ] - ) - - if self._on_message_processed is not None: - self._on_message_processed(topic_name, partition, offset) - def run( self, dataframe: StreamingDataFrame, @@ -723,7 +650,7 @@ def run( """ Start processing data from Kafka using provided `StreamingDataFrame` - One started, can be safely terminated with a `SIGTERM` signal + Once started, it can be safely terminated with a `SIGTERM` signal (like Kubernetes does) or a typical `KeyboardInterrupt` (`Ctrl+C`). @@ -751,7 +678,8 @@ def run( f"Starting the Application with the config: " f'broker_address="{self._broker_address}" ' f'consumer_group="{self._consumer_group}" ' - f'auto_offset_reset="{self._auto_offset_reset}"' + f'auto_offset_reset="{self._auto_offset_reset}" ' + f"commit_interval={self._commit_interval}s" ) if self._is_quix_app: self._quix_runtime_init() @@ -768,14 +696,6 @@ def run( ) exit_stack.callback(lambda *_: self.stop()) - if self._state_manager.stores: - # Store manager has stores registered, use real state transactions - # during processing - start_state_transaction = self._state_manager.start_store_transaction - else: - # Application is stateless, use dummy state transactions - start_state_transaction = _dummy_state_transaction - with exit_stack: # Subscribe to topics in Kafka and start polling self._consumer.subscribe( @@ -788,16 +708,84 @@ def run( # Start polling Kafka for messages and callbacks self._running = True + # Initialize the checkpoint + self._processing_context.init_checkpoint() + dataframe_composed = dataframe.compose() while self._running: if self._state_manager.recovery_required: self._state_manager.do_recovery() else: - self._process_message(dataframe_composed, start_state_transaction) + self._process_message(dataframe_composed) + self._processing_context.commit_checkpoint() + self._processing_context.commit_checkpoint(force=True) logger.info("Stop processing of StreamingDataFrame") + def _quix_runtime_init(self): + """ + Do a runtime setup only applicable to an Application.Quix instance + - Ensure that "State management" flag is enabled for deployment if the app + is stateful and is running in Quix Cloud + """ + # Ensure that state management is enabled if application is stateful + if self._state_manager.stores: + check_state_management_enabled() + + def _setup_topics(self): + topics_list = ", ".join( + f'"{topic.name}"' for topic in self._topic_manager.all_topics + ) + logger.info(f"Topics required for this application: {topics_list}") + if self._auto_create_topics: + self._topic_manager.create_all_topics() + self._topic_manager.validate_all_topics() + + def _process_message(self, dataframe_composed): + # Serve producer callbacks + self._producer.poll(self._producer_poll_timeout) + rows = self._consumer.poll_row(timeout=self._consumer_poll_timeout) + + if rows is None: + return + + # Deserializer may return multiple rows for a single message + rows = rows if isinstance(rows, list) else [rows] + if not rows: + return + + first_row = rows[0] + topic_name, partition, offset = ( + first_row.topic, + first_row.partition, + first_row.offset, + ) + + for row in rows: + context = copy_context() + context.run(set_message_context, row.context) + try: + # Execute StreamingDataFrame in a context + context.run(dataframe_composed, row.value) + except Filtered: + # The message was filtered by StreamingDataFrame + 
continue + except Exception as exc: + # TODO: This callback might be triggered because of Producer + # errors too because they happen within ".process()" + to_suppress = self._on_processing_error(exc, row, logger) + if not to_suppress: + raise + + # Store the message offset after it's successfully processed + self._processing_context.store_offset( + topic=topic_name, partition=partition, offset=offset + ) + + if self._on_message_processed is not None: + self._on_message_processed(topic_name, partition, offset) + def _on_assign(self, _, topic_partitions: List[TopicPartition]): """ Assign new topic partitions to consumer and state. @@ -807,6 +795,11 @@ def _on_assign(self, _, topic_partitions: List[TopicPartition]): # sometimes "empty" calls happen, probably updating the consumer epoch if not topic_partitions: return + + # First commit everything processed so far because assignment can take a while + # and fail + self._processing_context.commit_checkpoint(force=True) + # assigning manually here (instead of allowing it handle it automatically) # enables pausing them during recovery to work as expected self._consumer.incremental_assign(topic_partitions) @@ -843,6 +836,9 @@ def _on_revoke(self, _, topic_partitions: List[TopicPartition]): """ Revoke partitions from consumer and state """ + # Commit everything processed so far + self._processing_context.commit_checkpoint(force=True) + self._consumer.incremental_unassign(topic_partitions) if self._state_manager.stores: logger.debug(f"Rebalancing: revoking state store partitions") @@ -872,10 +868,3 @@ def _on_sigint(self, *_): def _on_sigterm(self, *_): logger.debug(f"Received SIGTERM, stopping the processing loop") self.stop() - - -_nullcontext = contextlib.nullcontext() - - -def _dummy_state_transaction(topic: str, partition: int, offset: int): - return _nullcontext diff --git a/quixstreams/checkpoint.py b/quixstreams/checkpoint.py new file mode 100644 index 000000000..baaa98df5 --- /dev/null +++ b/quixstreams/checkpoint.py @@ -0,0 +1,141 @@ +import logging +import time +from typing import Dict, Tuple + +from confluent_kafka import TopicPartition + +from quixstreams.kafka import Consumer, Producer +from quixstreams.state import ( + StateStoreManager, + PartitionTransaction, + DEFAULT_STATE_STORE_NAME, +) + +logger = logging.getLogger(__name__) + + +# TODO: Tests +class Checkpoint: + """ + Class to keep track of state updates and consumer offsets and to checkpoint these + updates on schedule. + """ + + def __init__( + self, + commit_interval: float, + producer: Producer, + consumer: Consumer, + state_manager: StateStoreManager, + ): + self._created_at = time.monotonic() + self._tp_offsets: Dict[Tuple[str, int], int] = {} + + # A mapping of <(topic, partition, store_name): PartitionTransaction> + self._store_transactions: Dict[(str, int, str), PartitionTransaction] = {} + # Ensure the checkpoint is not negative. + # Passing zero or lower will flush the checkpoint after each processed message + self._commit_interval = max(commit_interval, 0) + self._state_manager = state_manager + self._consumer = consumer + self._producer = producer + + # TODO: Can the checkpoint object be reused? + # Do we need to validate that it can't? + + def expired(self) -> bool: + """ + Returns `True` if checkpoint deadline has expired. + """ + return (time.monotonic() - self._commit_interval) >= self._created_at + + def empty(self) -> bool: + """ + Returns `True` if checkpoint doesn't have any offsets stored yet. 
+ :return: + """ + return not bool(self._tp_offsets) + + def store_offset(self, topic: str, partition: int, offset: int): + """ + Store the offset of the processed message to the checkpoint. + + :param topic: topic name + :param partition: partition number + :param offset: message offset + """ + self._tp_offsets[(topic, partition)] = offset + + def get_store_transaction( + self, topic: str, partition: int, store_name: str = DEFAULT_STATE_STORE_NAME + ) -> PartitionTransaction: + """ + Get a PartitionTransaction for the given store, topic and partition. + + It will return already started transaction if there's one. + + :param topic: topic name + :param partition: partition number + :param store_name: store name + :return: instance of `PartitionTransaction` + """ + transaction = self._store_transactions.get((topic, partition, store_name)) + if transaction is not None: + return transaction + + store = self._state_manager.get_store(topic=topic, store_name=store_name) + transaction = store.start_partition_transaction(partition=partition) + + self._store_transactions[(topic, partition, store_name)] = transaction + return transaction + + def commit(self): + """ + Commit the checkpoint. + + This method will: + 1. Flush the changelogs for each state store and ensure everything is produced. + 2. Commit topic offsets. + 3. Flush each state store partition to the disk. + + """ + # TODO: Error handling + + # 0. Produce the changelogs + # for ( + # topic, + # partition, + # store_name, + # ), transaction in self._store_transactions.items(): + # offset = self._tp_offsets[(topic, partition)] + # # TODO: Flush the changelogs. Call it "prepare"? + # if transaction.failed: + # raise + # transaction.prepare(offset=offset) + + # 1. Flush producer + # TODO: Check if all messages are flushed successfully + # TODO: Take the produced changelog offsets + # TODO: Logs + self._producer.flush() + + # 2. Commit offsets to Kafka + offsets = [ + TopicPartition(topic=topic, partition=partition, offset=offset + 1) + for (topic, partition), offset in self._tp_offsets.items() + ] + if offsets: + self._consumer.commit(offsets=offsets, asynchronous=False) + + # 3. 
Flush state store partitions to the disk + for ( + topic, + partition, + store_name, + ), transaction in self._store_transactions.items(): + offset = self._tp_offsets.get((topic, partition)) + if offset is not None: + transaction.maybe_flush(offset=offset) + + # TODO: Remove when the new changelog producer is implemented + self._producer.flush() diff --git a/quixstreams/dataframe/dataframe.py b/quixstreams/dataframe/dataframe.py index 5fdbf2be2..f036cae54 100644 --- a/quixstreams/dataframe/dataframe.py +++ b/quixstreams/dataframe/dataframe.py @@ -11,12 +11,11 @@ from quixstreams.context import ( message_context, set_message_context, - message_key, ) from quixstreams.core.stream import StreamCallable, Stream from quixstreams.models import Topic, Row, MessageContext -from quixstreams.rowproducer import RowProducerProto -from quixstreams.state import StateStoreManager, State +from quixstreams.processing_context import ProcessingContext +from quixstreams.state import State from .base import BaseStreaming from .exceptions import InvalidOperation from .series import StreamingSeries @@ -79,13 +78,17 @@ class StreamingDataFrame(BaseStreaming): def __init__( self, topic: Topic, - state_manager: StateStoreManager, + processing_context: ProcessingContext, stream: Optional[Stream] = None, ): self._stream: Stream = stream or Stream() self._topic = topic - self._real_producer: Optional[RowProducerProto] = None - self._state_manager = state_manager + self._processing_context = processing_context + self._producer = processing_context.producer + + @property + def processing_context(self) -> ProcessingContext: + return self._processing_context @property def stream(self) -> Stream: @@ -95,10 +98,6 @@ def stream(self) -> Stream: def topic(self) -> Topic: return self._topic - @property - def state_manager(self) -> StateStoreManager: - return self._state_manager - def __bool__(self): raise InvalidOperation( f"Cannot assess truth level of a {self.__class__.__name__} " @@ -144,7 +143,7 @@ def func(d: dict, state: State): """ if stateful: self._register_store() - func = _as_stateful(func=func, state_manager=self._state_manager) + func = _as_stateful(func=func, processing_context=self._processing_context) stream = self.stream.add_apply(func, expand=expand) return self._clone(stream=stream) @@ -183,7 +182,7 @@ def func(values: list, state: State): """ if stateful: self._register_store() - func = _as_stateful(func=func, state_manager=self._state_manager) + func = _as_stateful(func=func, processing_context=self._processing_context) stream = self.stream.add_update(func) return self._clone(stream=stream) @@ -225,21 +224,11 @@ def func(d: dict, state: State): if stateful: self._register_store() - func = _as_stateful(func=func, state_manager=self._state_manager) + func = _as_stateful(func=func, processing_context=self._processing_context) stream = self.stream.add_filter(func) return self._clone(stream=stream) - @property - def producer(self) -> RowProducerProto: - if self._real_producer is None: - raise RuntimeError("Producer instance has not been provided") - return self._real_producer - - @producer.setter - def producer(self, producer: RowProducerProto): - self._real_producer = producer - @staticmethod def contains(key: str) -> StreamingSeries: """ @@ -519,23 +508,25 @@ def hopping_window( def _clone(self, stream: Stream) -> Self: clone = self.__class__( - stream=stream, topic=self._topic, state_manager=self._state_manager + stream=stream, + topic=self._topic, + processing_context=self._processing_context, ) - if 
self._real_producer is not None: - clone.producer = self._real_producer return clone def _produce(self, topic: Topic, value: object, key: Optional[object] = None): ctx = message_context() key = key or ctx.key row = Row(value=value, context=ctx) # noqa - self.producer.produce_row(row, topic, key=key) + self._producer.produce_row(row, topic, key=key) def _register_store(self): """ Register the default store for input topic in StateStoreManager """ - self._state_manager.register_store(topic_name=self._topic.name) + self._processing_context.state_manager.register_store( + topic_name=self._topic.name + ) def __setitem__(self, key, value: Union[Self, object]): if isinstance(value, self.__class__): @@ -579,22 +570,24 @@ def __getitem__( # Take only certain keys from the dict and return a new dict return self.apply(lambda v: {k: v[k] for k in item}) elif isinstance(item, str): - # Create a StreamingSeries based on key + # Create a StreamingSeries based on a column name return StreamingSeries(name=item) else: raise TypeError(f'Unsupported key type "{type(item)}"') def _as_stateful( - func: DataFrameStatefulFunc, state_manager: StateStoreManager + func: DataFrameStatefulFunc, processing_context: ProcessingContext ) -> DataFrameFunc: @functools.wraps(func) def wrapper(value: object) -> object: - transaction = state_manager.get_store_transaction() - key = message_key() - # Prefix all the state keys by the message key - with transaction.with_prefix(prefix=key): - # Pass a State object with an interface limited to the key updates only - return func(value, transaction.state) + ctx = message_context() + transaction = processing_context.checkpoint.get_store_transaction( + topic=ctx.topic, partition=ctx.partition + ) + # Pass a State object with an interface limited to the key updates only + # and prefix all the state keys by the message key + state = transaction.as_state(prefix=ctx.key) + return func(value, state) return wrapper diff --git a/quixstreams/dataframe/windows/time_based.py b/quixstreams/dataframe/windows/time_based.py index 609483f4c..1e0f02c63 100644 --- a/quixstreams/dataframe/windows/time_based.py +++ b/quixstreams/dataframe/windows/time_based.py @@ -1,16 +1,10 @@ -import logging import functools +import logging from typing import Any, Optional, List, TYPE_CHECKING, cast, Tuple -from quixstreams.context import ( - message_context, - message_key, -) -from quixstreams.state import ( - StateStoreManager, - WindowedPartitionTransaction, - WindowedState, -) +from quixstreams.context import message_context +from quixstreams.processing_context import ProcessingContext +from quixstreams.state import WindowedPartitionTransaction, WindowedState from .base import ( WindowedDataFrameFunc, WindowAggregateFunc, @@ -161,7 +155,7 @@ def current(self, expand: bool = True) -> "StreamingDataFrame": ) def register_store(self): - self._dataframe.state_manager.register_windowed_store( + self._dataframe.processing_context.state_manager.register_windowed_store( topic_name=self._dataframe.topic.name, store_name=self._name ) @@ -177,7 +171,9 @@ def _apply_window( self.register_store() func = _as_windowed( - func=func, state_manager=self._dataframe.state_manager, store_name=name + func=func, + processing_context=self._dataframe.processing_context, + store_name=name, ) return self._dataframe.apply(func=func, expand=expand) @@ -194,23 +190,25 @@ def _noop() -> Any: def _as_windowed( - func: WindowedDataFrameFunc, state_manager: StateStoreManager, store_name: str + func: WindowedDataFrameFunc, processing_context: 
ProcessingContext, store_name: str ) -> "DataFrameFunc": @functools.wraps(func) def wrapper(value: object) -> object: + ctx = message_context() + key = ctx.key transaction = cast( WindowedPartitionTransaction, - state_manager.get_store_transaction(store_name=store_name), + processing_context.checkpoint.get_store_transaction( + topic=ctx.topic, partition=ctx.partition, store_name=store_name + ), ) - key = message_key() if key is None: - ctx = message_context() logger.warning( f"Skipping window processing for a message because the key is None, " f"partition='{ctx.topic}[{ctx.partition}]' offset='{ctx.offset}'." ) return _noop() - with transaction.with_prefix(prefix=key): - return func(value, transaction.state) + state = transaction.as_state(prefix=key) + return func(value, state) return wrapper diff --git a/quixstreams/processing_context.py b/quixstreams/processing_context.py new file mode 100644 index 000000000..aae237309 --- /dev/null +++ b/quixstreams/processing_context.py @@ -0,0 +1,75 @@ +import dataclasses +import logging +from typing import Optional + +from quixstreams.checkpoint import Checkpoint +from quixstreams.exceptions import QuixException +from quixstreams.rowconsumer import RowConsumer +from quixstreams.rowproducer import RowProducer +from quixstreams.state import StateStoreManager + +__all__ = ("ProcessingContext",) + +logger = logging.getLogger(__name__) + + +class CheckpointNotInitialized(QuixException): ... + + +@dataclasses.dataclass +class ProcessingContext: + """ + A class to share processing-related objects + between `Application` and `StreamingDataFrame` instances. + """ + + commit_interval: float + producer: RowProducer + consumer: RowConsumer + state_manager: StateStoreManager + _checkpoint: Optional[Checkpoint] = dataclasses.field( + init=False, repr=False, default=None + ) + + @property + def checkpoint(self) -> Checkpoint: + if self._checkpoint is None: + raise CheckpointNotInitialized("Checkpoint has not been initialized yet") + return self._checkpoint + + def store_offset(self, topic: str, partition: int, offset: int): + """ + Store the offset of the processed message to the checkpoint. + + :param topic: topic name + :param partition: partition number + :param offset: message offset + """ + self._checkpoint.store_offset(topic=topic, partition=partition, offset=offset) + + def init_checkpoint(self): + """ + Initialize a new checkpoint + """ + self._checkpoint = Checkpoint( + commit_interval=self.commit_interval, + state_manager=self.state_manager, + producer=self.producer, + consumer=self.consumer, + ) + + def commit_checkpoint(self, force: bool = False): + """ + Commit the current checkpoint. + + The actual commit will happen only when: + + 1. The checkpoint has at least one stored offset + 2. The checkpoint is expired or `force=True` is passed + + :param force: if `True`, commit the checkpoint before its expiration deadline. 
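
`ProcessingContext.commit_checkpoint()` commits only when the checkpoint holds at least one stored offset and either the commit interval has elapsed or `force=True` is passed, and then starts a fresh checkpoint. A rough, self-contained sketch of just that timing logic follows; `MiniCheckpoint` and the free-standing `commit_checkpoint` function are simplified stand-ins that omit the real producer/consumer/state wiring.

import time
from typing import Dict, Tuple


class MiniCheckpoint:
    """Simplified stand-in for the patch's Checkpoint; only the timing logic is shown."""

    def __init__(self, commit_interval: float):
        self._commit_interval = max(commit_interval, 0)
        self._created_at = time.monotonic()
        self._tp_offsets: Dict[Tuple[str, int], int] = {}

    def expired(self) -> bool:
        # Same expiration check as the patch's Checkpoint.expired()
        return (time.monotonic() - self._commit_interval) >= self._created_at

    def empty(self) -> bool:
        return not self._tp_offsets

    def store_offset(self, topic: str, partition: int, offset: int):
        self._tp_offsets[(topic, partition)] = offset

    def commit(self):
        print(f"committing offsets: {self._tp_offsets}")


def commit_checkpoint(checkpoint: MiniCheckpoint, force: bool = False) -> MiniCheckpoint:
    # Same condition as ProcessingContext.commit_checkpoint() in this patch
    if not checkpoint.empty() and (checkpoint.expired() or force):
        checkpoint.commit()
        # Start a new checkpoint after a successful commit
        return MiniCheckpoint(commit_interval=checkpoint._commit_interval)
    return checkpoint


checkpoint = MiniCheckpoint(commit_interval=5.0)
checkpoint.store_offset("topic", 0, 42)
checkpoint = commit_checkpoint(checkpoint, force=True)
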
+ """ + if not self._checkpoint.empty() and (self._checkpoint.expired() or force): + logger.info(f"Committing a checkpoint force={force}") + self._checkpoint.commit() + self.init_checkpoint() diff --git a/quixstreams/state/manager.py b/quixstreams/state/manager.py index d83190a8e..0db79d54c 100644 --- a/quixstreams/state/manager.py +++ b/quixstreams/state/manager.py @@ -1,31 +1,25 @@ -import contextlib import logging import shutil from pathlib import Path -from typing import List, Dict, Optional, Iterator +from typing import List, Dict, Optional from quixstreams.rowproducer import RowProducer from quixstreams.types import TopicPartition from .exceptions import ( StoreNotRegisteredError, - InvalidStoreTransactionStateError, PartitionStoreIsUsed, WindowedStoreAlreadyRegisteredError, ) from .recovery import RecoveryManager, ChangelogProducerFactory from .rocksdb import RocksDBStore, RocksDBOptionsType from .rocksdb.windowed.store import WindowedRocksDBStore -from .types import ( - Store, - PartitionTransaction, - StorePartition, -) +from .types import Store, StorePartition -__all__ = ("StateStoreManager",) +__all__ = ("StateStoreManager", "DEFAULT_STATE_STORE_NAME") logger = logging.getLogger(__name__) -_DEFAULT_STATE_STORE_NAME = "default" +DEFAULT_STATE_STORE_NAME = "default" class StateStoreManager: @@ -52,7 +46,6 @@ def __init__( self._stores: Dict[str, Dict[str, Store]] = {} self._producer = producer self._recovery_manager = recovery_manager - self._transaction: Optional[_MultiStoreTransaction] = None def _init_state_dir(self): logger.info(f'Initializing state directory at "{self._state_dir}"') @@ -106,7 +99,7 @@ def stop_recovery(self): return self._recovery_manager.stop_recovery() def get_store( - self, topic: str, store_name: str = _DEFAULT_STATE_STORE_NAME + self, topic: str, store_name: str = DEFAULT_STATE_STORE_NAME ) -> Store: """ Get a store for given name and topic @@ -139,7 +132,7 @@ def _setup_changelogs( ) def register_store( - self, topic_name: str, store_name: str = _DEFAULT_STATE_STORE_NAME + self, topic_name: str, store_name: str = DEFAULT_STATE_STORE_NAME ): """ Register a state store to be managed by StateStoreManager. @@ -256,123 +249,9 @@ def close(self): for store in topic_stores.values(): store.close() - def get_store_transaction( - self, store_name: str = _DEFAULT_STATE_STORE_NAME - ) -> PartitionTransaction: - """ - Get active `PartitionTransaction` for the store - :param store_name: - :return: - """ - if self._transaction is None: - raise InvalidStoreTransactionStateError( - "Store transaction is not started yet" - ) - return self._transaction.get_store_transaction(store_name=store_name) - - @contextlib.contextmanager - def start_store_transaction( - self, topic: str, partition: int, offset: int - ) -> Iterator["_MultiStoreTransaction"]: - """ - Starting the multi-store transaction for the Kafka message. - - This transaction will keep track of all used stores and flush them in the end. - If any exception is caught during this transaction, none of them - will be flushed as a best effort to keep stores consistent in "at-least-once" setting. - - There can be only one active transaction at a time. Starting a new transaction - before the end of the current one will fail. 
- - - :param topic: message topic - :param partition: message partition - :param offset: message offset - """ - if not self._stores.get(topic): - raise StoreNotRegisteredError( - f'Topic "{topic}" does not have stores registered' - ) - - if self._transaction is not None: - raise InvalidStoreTransactionStateError( - "Another transaction is already in progress" - ) - self._transaction = _MultiStoreTransaction( - manager=self, topic=topic, partition=partition, offset=offset - ) - try: - yield self._transaction - self._transaction.flush() - finally: - self._transaction = None - def __enter__(self): self.init() return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() - - -class _MultiStoreTransaction: - """ - A transaction-like class to manage flushing of multiple state partitions for each - processed message. - - It is responsible for: - - Keeping track of actual DBTransactions for the individual stores - - Flushing of the opened transactions in the end - - """ - - def __init__( - self, manager: "StateStoreManager", topic: str, partition: int, offset: int - ): - self._manager = manager - self._transactions: Dict[str, PartitionTransaction] = {} - self._topic = topic - self._partition = partition - self._offset = offset - - def get_store_transaction( - self, store_name: str = _DEFAULT_STATE_STORE_NAME - ) -> PartitionTransaction: - """ - Get a PartitionTransaction for the given store - - It will return already started transaction if there's one. - - :param store_name: store name - :return: instance of `PartitionTransaction` - """ - transaction = self._transactions.get(store_name) - if transaction is not None: - return transaction - - store = self._manager.get_store(topic=self._topic, store_name=store_name) - transaction = store.start_partition_transaction(partition=self._partition) - self._transactions[store_name] = transaction - return transaction - - def flush(self): - """ - Flush all `PartitionTransaction` instances for each registered store and - save the last processed offset for each partition. - - Empty transactions without any updates will not be flushed. - - If there are any failed transactions, no transactions will be flushed - to keep the stores consistent. 
- """ - for store_name, transaction in self._transactions.items(): - if transaction.failed: - logger.warning( - f'Detected failed transaction for store "{store_name}" ' - f'(topic "{self._topic}" partition "{self._partition}" ' - f'offset "{self._offset}), state transactions will not be flushed"' - ) - return - - for transaction in self._transactions.values(): - transaction.maybe_flush(offset=self._offset) diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py index f6cdb9853..e1ee5db20 100644 --- a/quixstreams/state/rocksdb/transaction.py +++ b/quixstreams/state/rocksdb/transaction.py @@ -1,10 +1,8 @@ -import contextlib import functools import logging from typing import Any, Union, Optional, Dict, NewType, TYPE_CHECKING from rocksdict import WriteBatch, ColumnFamily -from typing_extensions import Self from quixstreams.state.types import ( DumpsFunc, @@ -33,16 +31,14 @@ logger = logging.getLogger(__name__) - Undefined = NewType("Undefined", object) -_undefined = Undefined(object()) -_deleted = Undefined(object()) - -_DEFAULT_PREFIX = b"" +UNDEFINED = Undefined(object()) +DELETED = Undefined(object()) +DEFAULT_PREFIX = b"" -__all__ = ("RocksDBPartitionTransaction",) +__all__ = ("RocksDBPartitionTransaction", "DEFAULT_PREFIX", "DELETED") def _validate_transaction_state(func): @@ -78,10 +74,9 @@ class RocksDBPartitionTransaction(PartitionTransaction): Prefixing ********* - `RocksDBTransaction` allows to set prefixes for the keys in the given code block - using :meth:`with_prefix()` context manager. - Normally, `StreamingDataFrame` class will use message keys as prefixes - in order to namespace the stored keys across different messages. + Methods `get()`, `set()`, `delete()` and `exists()` methods require prefixes for + the keys. + Normally, the Kafka message keys are supposed to be used as prefixes. Transactional properties ************************ @@ -102,14 +97,11 @@ class RocksDBPartitionTransaction(PartitionTransaction): "_partition", "_update_cache", "_batch", - "_prefix", "_failed", "_completed", "_dumps", "_loads", - "_state", ) - _prefix: bytes def __init__( self, @@ -128,47 +120,33 @@ def __init__( str, Dict[bytes, Dict[bytes, Union[bytes, Undefined]]] ] = {"default": {}} self._batch = WriteBatch(raw_mode=True) - self._prefix = _DEFAULT_PREFIX self._failed = False self._completed = False self._dumps = dumps self._loads = loads - self._state = TransactionState(transaction=self) - - @property - def state(self) -> TransactionState: - return self._state - @contextlib.contextmanager - def with_prefix(self, prefix: Any = b"") -> Self: + def as_state(self, prefix: Any = DEFAULT_PREFIX) -> TransactionState: """ - A context manager set the prefix for all keys in the scope. + Create a one-time `TransactionState` object with a limited CRUD interface. - Normally, it's called by Streaming DataFrames engine to ensure that every - message key is stored separately. + The `TransactionState` will prefix all the keys with the supplied `prefix` + for all underlying operations. - The `with_prefix` calls should not be nested. - Only one prefix can be set at a time. - - :param prefix: a prefix string to be used. - Should be either `bytes` or object serializable to `bytes` - by `dumps` function. - The prefix doesn't need to contain the separator, it will be added - automatically between the key and the prefix if the prefix - is not empty. 
+ :param prefix: a prefix to be used for all keys + :return: """ - self._prefix = ( - prefix if isinstance(prefix, bytes) else self._serialize_value(prefix) + return TransactionState( + transaction=self, + prefix=( + prefix + if isinstance(prefix, bytes) + else serialize(prefix, dumps=self._dumps) + ), ) - try: - yield self - finally: - self._prefix = _DEFAULT_PREFIX - @_validate_transaction_state def get( - self, key: Any, default: Any = None, cf_name: str = "default" + self, key: Any, prefix: bytes, default: Any = None, cf_name: str = "default" ) -> Optional[Any]: """ Get a key from the store. @@ -179,6 +157,7 @@ def get( It returns `None` if the key is not found and `default` is not provided. :param key: a key to get from DB + :param prefix: a key prefix :param default: value to return if the key is not present in the state. It can be of any type. :param cf_name: rocksdb column family name. Default - "default" @@ -187,40 +166,41 @@ def get( # First, check the update cache in case the value was previously written # Use _undefined sentinel as default because the actual value can be "None" - key_serialized = self._serialize_key(key) + key_serialized = self._serialize_key(key, prefix=prefix) cached = ( self._update_cache.get(cf_name, {}) - .get(self._prefix, {}) - .get(key_serialized, _undefined) + .get(prefix, {}) + .get(key_serialized, UNDEFINED) ) - if cached is _deleted: + if cached is DELETED: return default - if cached is not _undefined: + if cached is not UNDEFINED: return self._deserialize_value(cached) # The value is not found in cache, check the db - stored = self._partition.get(key_serialized, _undefined, cf_name=cf_name) - if stored is not _undefined: + stored = self._partition.get(key_serialized, UNDEFINED, cf_name=cf_name) + if stored is not UNDEFINED: return self._deserialize_value(stored) return default @_validate_transaction_state - def set(self, key: Any, value: Any, cf_name: str = "default"): + def set(self, key: Any, value: Any, prefix: bytes, cf_name: str = "default"): """ Set a key to the store. It first updates the key in the update cache. :param key: key to store in DB + :param prefix: a key prefix :param value: value to store in DB :param cf_name: rocksdb column family name. Default - "default" """ try: - key_serialized = self._serialize_key(key) + key_serialized = self._serialize_key(key, prefix=prefix) value_serialized = self._serialize_value(value) - self._update_cache.setdefault(cf_name, {}).setdefault(self._prefix, {})[ + self._update_cache.setdefault(cf_name, {}).setdefault(prefix, {})[ key_serialized ] = value_serialized except Exception: @@ -228,47 +208,49 @@ def set(self, key: Any, value: Any, cf_name: str = "default"): raise @_validate_transaction_state - def delete(self, key: Any, cf_name: str = "default"): + def delete(self, key: Any, prefix: bytes, cf_name: str = "default"): """ Delete a key from the store. It first deletes the key from the update cache. - :param key: key to delete from DB + :param key: a key to delete from DB + :param prefix: a key prefix :param cf_name: rocksdb column family name. 
Default - "default" """ try: - key_serialized = self._serialize_key(key) - self._update_cache.setdefault(cf_name, {}).setdefault(self._prefix, {})[ + key_serialized = self._serialize_key(key, prefix=prefix) + self._update_cache.setdefault(cf_name, {}).setdefault(prefix, {})[ key_serialized - ] = _deleted + ] = DELETED except Exception: self._failed = True raise @_validate_transaction_state - def exists(self, key: Any, cf_name: str = "default") -> bool: + def exists(self, key: Any, prefix: bytes, cf_name: str = "default") -> bool: """ Check if a key exists in the store. It first looks up the key in the update cache. :param key: a key to check in DB + :param prefix: a key prefix :param cf_name: rocksdb column family name. Default - "default" :return: `True` if the key exists, `False` otherwise. """ - key_serialized = self._serialize_key(key) + key_serialized = self._serialize_key(key, prefix=prefix) cached = ( self._update_cache.get(cf_name, {}) - .get(self._prefix, {}) - .get(key_serialized, _undefined) + .get(prefix, {}) + .get(key_serialized, UNDEFINED) ) - if cached is _deleted: + if cached is DELETED: return False - if cached is not _undefined: + if cached is not UNDEFINED: return True return self._partition.exists(key_serialized, cf_name=cf_name) @@ -309,7 +291,7 @@ def _update_changelog(self, meta_cf_handle: ColumnFamily): for key, value in prefix_update_cache.items(): self._partition.produce_to_changelog( key=key, - value=value if value is not _deleted else None, + value=value if value is not DELETED else None, headers=headers, ) offset += 1 @@ -341,7 +323,7 @@ def maybe_flush(self, offset: Optional[int] = None): cf_handle = self._partition.get_column_family_handle(cf_name) for _prefix, prefix_update_cache in cf_update_cache.items(): for key, value in prefix_update_cache.items(): - if value is _deleted: + if value is DELETED: self._batch.delete(key, cf_handle) else: self._batch.put(key, value, cf_handle) @@ -370,9 +352,9 @@ def _serialize_value(self, value: Any) -> bytes: def _deserialize_value(self, value: bytes) -> Any: return deserialize(value, loads=self._loads) - def _serialize_key(self, key: Any) -> bytes: + def _serialize_key(self, key: Any, prefix: bytes) -> bytes: key_bytes = serialize(key, dumps=self._dumps) - prefix = self._prefix + PREFIX_SEPARATOR if self._prefix else b"" + prefix = prefix + PREFIX_SEPARATOR if prefix else b"" return prefix + key_bytes def __enter__(self): diff --git a/quixstreams/state/rocksdb/windowed/state.py b/quixstreams/state/rocksdb/windowed/state.py index 2b068d003..6a1071732 100644 --- a/quixstreams/state/rocksdb/windowed/state.py +++ b/quixstreams/state/rocksdb/windowed/state.py @@ -7,15 +7,18 @@ class WindowedTransactionState(WindowedState): - __slots__ = ("_transaction",) + __slots__ = ("_transaction", "_prefix") - def __init__(self, transaction: "WindowedRocksDBPartitionTransaction"): + def __init__( + self, transaction: "WindowedRocksDBPartitionTransaction", prefix: bytes + ): """ A windowed state to be provided into `StreamingDataFrame` window functions. 
:param transaction: instance of `WindowedRocksDBPartitionTransaction` """ self._transaction = transaction + self._prefix = prefix def get_window( self, start_ms: int, end_ms: int, default: Any = None @@ -30,7 +33,7 @@ def get_window( :return: value or None if the key is not found and `default` is not provided """ return self._transaction.get_window( - start_ms=start_ms, end_ms=end_ms, default=default + start_ms=start_ms, end_ms=end_ms, default=default, prefix=self._prefix ) def update_window(self, start_ms: int, end_ms: int, value: Any, timestamp_ms: int): @@ -47,7 +50,11 @@ def update_window(self, start_ms: int, end_ms: int, value: Any, timestamp_ms: in :param timestamp_ms: current message timestamp in milliseconds """ return self._transaction.update_window( - start_ms=start_ms, end_ms=end_ms, timestamp_ms=timestamp_ms, value=value + start_ms=start_ms, + end_ms=end_ms, + timestamp_ms=timestamp_ms, + value=value, + prefix=self._prefix, ) def get_latest_timestamp(self) -> int: @@ -74,5 +81,5 @@ def expire_windows( "latest timestamp". """ return self._transaction.expire_windows( - duration_ms=duration_ms, grace_ms=grace_ms + duration_ms=duration_ms, grace_ms=grace_ms, prefix=self._prefix ) diff --git a/quixstreams/state/rocksdb/windowed/transaction.py b/quixstreams/state/rocksdb/windowed/transaction.py index 9e674442f..754bcd96a 100644 --- a/quixstreams/state/rocksdb/windowed/transaction.py +++ b/quixstreams/state/rocksdb/windowed/transaction.py @@ -10,8 +10,8 @@ LATEST_TIMESTAMP_KEY, PREFIX_SEPARATOR, ) -from ..transaction import RocksDBPartitionTransaction, _deleted from ..serialization import int_to_int64_bytes, serialize +from ..transaction import RocksDBPartitionTransaction, DELETED, DEFAULT_PREFIX from ..types import LoadsFunc, DumpsFunc if TYPE_CHECKING: @@ -30,12 +30,17 @@ def __init__( ): super().__init__(partition=partition, dumps=dumps, loads=loads) self._partition = cast("WindowedRocksDBStorePartition", self._partition) - self._state = WindowedTransactionState(transaction=self) self._latest_timestamp_ms = latest_timestamp_ms - @property - def state(self) -> "WindowedTransactionState": - return self._state + def as_state(self, prefix: Any = DEFAULT_PREFIX) -> WindowedTransactionState: + return WindowedTransactionState( + transaction=self, + prefix=( + prefix + if isinstance(prefix, bytes) + else serialize(prefix, dumps=self._dumps) + ), + ) def get_latest_timestamp(self) -> int: return self._latest_timestamp_ms @@ -47,24 +52,32 @@ def _validate_duration(self, start_ms: int, end_ms: int): f"than window start {start_ms}" ) - def get_window(self, start_ms: int, end_ms: int, default: Any = None) -> Any: + def get_window( + self, + start_ms: int, + end_ms: int, + prefix: bytes, + default: Any = None, + ) -> Any: self._validate_duration(start_ms=start_ms, end_ms=end_ms) key = encode_window_key(start_ms, end_ms) - return self.get(key=key, default=default) + return self.get(key=key, default=default, prefix=prefix) - def update_window(self, start_ms: int, end_ms: int, value: Any, timestamp_ms: int): + def update_window( + self, start_ms: int, end_ms: int, value: Any, timestamp_ms: int, prefix: bytes + ): if timestamp_ms < 0: raise ValueError("Timestamp cannot be negative") self._validate_duration(start_ms=start_ms, end_ms=end_ms) key = encode_window_key(start_ms, end_ms) - self.set(key=key, value=value) + self.set(key=key, value=value, prefix=prefix) self._latest_timestamp_ms = max(self._latest_timestamp_ms, timestamp_ms) - def delete_window(self, start_ms: int, end_ms: int): + def 
delete_window(self, start_ms: int, end_ms: int, prefix: bytes): self._validate_duration(start_ms=start_ms, end_ms=end_ms) key = encode_window_key(start_ms, end_ms) - self.delete(key=key) + self.delete(key=key, prefix=prefix) def maybe_flush(self, offset: Optional[int] = None): cf_handle = self._partition.get_column_family_handle(METADATA_CF_NAME) @@ -77,7 +90,7 @@ def maybe_flush(self, offset: Optional[int] = None): self._partition.set_latest_timestamp(self._latest_timestamp_ms) def expire_windows( - self, duration_ms: int, grace_ms: int = 0 + self, duration_ms: int, prefix: bytes, grace_ms: int = 0 ) -> List[Tuple[Tuple[int, int], Any]]: """ Get a list of expired windows from RocksDB considering latest timestamp, @@ -104,7 +117,8 @@ def expire_windows( # Find the latest start timestamp of the expired windows for the given key last_expired = self.get( - LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, + key=LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, + prefix=prefix, cf_name=LATEST_EXPIRED_WINDOW_CF_NAME, ) if last_expired is not None: @@ -115,30 +129,30 @@ def expire_windows( expired_windows = self._get_windows( start_from_ms=start_from, start_to_ms=start_to, + prefix=prefix, ) if expired_windows: # Save the start of the latest expired window to the expiration index latest_window = expired_windows[-1] last_expired__gt = latest_window[0][0] self.set( - LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, - last_expired__gt, + key=LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, + value=last_expired__gt, + prefix=prefix, cf_name=LATEST_EXPIRED_WINDOW_CF_NAME, ) # Delete expired windows from the state for (start, end), _ in expired_windows: - self.delete_window(start, end) + self.delete_window(start, end, prefix=prefix) return expired_windows - def _serialize_key(self, key: Any) -> bytes: + def _serialize_key(self, key: Any, prefix: bytes) -> bytes: # Allow bytes keys in WindowedStore key_bytes = key if isinstance(key, bytes) else serialize(key, dumps=self._dumps) - return self._prefix + PREFIX_SEPARATOR + key_bytes + return prefix + PREFIX_SEPARATOR + key_bytes def _get_windows( - self, - start_from_ms: int, - start_to_ms: int, + self, start_from_ms: int, start_to_ms: int, prefix: bytes ) -> List[Tuple[Tuple[int, int], Any]]: """ Get all windows starting between "start_from" and "start_to" @@ -156,11 +170,11 @@ def _get_windows( # Iterate over rocksdb within the given prefix and (start_form, start_to) # timestamps seek_from = max(start_from_ms, 0) - seek_from_key = encode_window_prefix(prefix=self._prefix, start_ms=seek_from) + seek_from_key = encode_window_prefix(prefix=prefix, start_ms=seek_from) # Add +1 to make the "start_to" inclusive seek_to = start_to_ms + 1 - seek_to_key = encode_window_prefix(prefix=self._prefix, start_ms=seek_to) + seek_to_key = encode_window_prefix(prefix=prefix, start_ms=seek_to) # Set iterator bounds to reduce the potential IO read_opt = ReadOptions() @@ -176,10 +190,10 @@ def _get_windows( windows[(start, end)] = self._deserialize_value(value) for window_key, window_value in ( - self._update_cache["default"].get(self._prefix, {}).items() + self._update_cache["default"].get(prefix, {}).items() ): message_key, start, end = parse_window_key(window_key) - if window_value is _deleted: + if window_value is DELETED: windows.pop((start, end), None) continue elif start_from_ms < start <= start_to_ms: diff --git a/quixstreams/state/state.py b/quixstreams/state/state.py index 7262cdcf1..3e06d9882 100644 --- a/quixstreams/state/state.py +++ b/quixstreams/state/state.py @@ -4,14 +4,18 @@ class TransactionState(State): 
- __slots__ = ("_transaction",) + __slots__ = ( + "_transaction", + "_prefix", + ) - def __init__(self, transaction: PartitionTransaction): + def __init__(self, prefix: bytes, transaction: PartitionTransaction): """ Simple key-value state to be provided into `StreamingDataFrame` functions :param transaction: instance of `PartitionTransaction` """ + self._prefix = prefix self._transaction = transaction def get(self, key: Any, default: Any = None) -> Optional[Any]: @@ -22,7 +26,7 @@ def get(self, key: Any, default: Any = None) -> Optional[Any]: :param default: default value to return if the key is not found :return: value or None if the key is not found and `default` is not provided """ - return self._transaction.get(key=key, default=default) + return self._transaction.get(key=key, prefix=self._prefix, default=default) def set(self, key: Any, value: Any): """ @@ -30,7 +34,7 @@ def set(self, key: Any, value: Any): :param key: key :param value: value """ - return self._transaction.set(key=key, value=value) + return self._transaction.set(key=key, value=value, prefix=self._prefix) def delete(self, key: Any): """ @@ -39,7 +43,7 @@ def delete(self, key: Any): This function always returns `None`, even if value is not found. :param key: key """ - return self._transaction.delete(key=key) + return self._transaction.delete(key=key, prefix=self._prefix) def exists(self, key: Any) -> bool: """ @@ -48,4 +52,4 @@ def exists(self, key: Any) -> bool: :return: True if key exists, False otherwise """ - return self._transaction.exists(key=key) + return self._transaction.exists(key=key, prefix=self._prefix) diff --git a/quixstreams/state/types.py b/quixstreams/state/types.py index 25f9b447c..56cb77113 100644 --- a/quixstreams/state/types.py +++ b/quixstreams/state/types.py @@ -1,8 +1,4 @@ -import contextlib - -from typing import Protocol, Any, Optional, Iterator, Callable, Dict, ClassVar - -from typing_extensions import Self +from typing import Protocol, Any, Optional, Callable, Dict, ClassVar from quixstreams.models import ConfluentKafkaMessageProto from quixstreams.models.types import MessageHeadersMapping @@ -60,9 +56,7 @@ def revoke_partition(self, partition: int): """ ... - def start_partition_transaction( - self, partition: int - ) -> Optional["PartitionTransaction"]: + def start_partition_transaction(self, partition: int) -> "PartitionTransaction": """ Start a new partition transaction. @@ -189,17 +183,59 @@ def exists(self, key: Any) -> bool: ... -class PartitionTransaction(State): +class PartitionTransaction(Protocol): """ A transaction class to perform simple key-value operations like "get", "set", "delete" and "exists" on a single storage partition. """ - @property - def state(self) -> State: + def as_state(self, prefix: Any) -> State: """ - An instance of State to be provided to `StreamingDataFrame` functions - :return: + Create an instance implementing the `State` protocol to be provided + to `StreamingDataFrame` functions. + All operations called on this State object will be prefixed with + the supplied `prefix`. + + :return: an instance implementing the `State` protocol + """ + ... + + def get(self, key: Any, prefix: bytes, default: Any = None) -> Optional[Any]: + """ + Get the value for key if key is present in the state, else default + + :param key: key + :param prefix: a key prefix + :param default: default value to return if the key is not found + :return: value or None if the key is not found and `default` is not provided + """ + ... 
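For orientation, a minimal usage sketch of the prefix-aware API above, assuming `store` is a `RocksDBStore` with partition 0 assigned (as in the tests further below): values written through `as_state()` under different prefixes stay isolated within one transaction, and the transaction-level methods take the prefix explicitly.

    # Sketch only: `store` stands in for an assigned RocksDBStore partition.
    with store.start_partition_transaction(0) as tx:
        state_a = tx.as_state(prefix=b"key-a")
        state_b = tx.as_state(prefix=b"key-b")
        state_a.set("total", 1)
        state_b.set("total", 2)
        assert state_a.get("total") == 1
        assert state_b.get("total") == 2
        # The lower-level transaction methods accept the prefix explicitly
        assert tx.get("total", prefix=b"key-a") == 1
        assert tx.exists("total", prefix=b"key-b")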
+ + def set(self, key: Any, prefix: bytes, value: Any): + """ + Set value for the key. + :param key: key + :param prefix: a key prefix + :param value: value + """ + ... + + def delete(self, key: Any, prefix: bytes): + """ + Delete value for the key. + + This function always returns `None`, even if value is not found. + :param key: key + :param prefix: a key prefix + """ + ... + + def exists(self, key: Any, prefix: bytes) -> bool: + """ + Check if the key exists in state. + :param key: key + :param prefix: a key prefix + :return: True if key exists, False otherwise """ ... @@ -223,18 +259,6 @@ def completed(self) -> bool: """ ... - @contextlib.contextmanager - def with_prefix(self, prefix: Any = b"") -> Iterator[Self]: - """ - A context manager set the prefix for all keys in the scope. - - Normally, it's called by `StreamingDataFrame` internals to ensure that every - message key is stored separately. - :param prefix: key prefix - :return: context manager - """ - ... - def maybe_flush(self, offset: Optional[int] = None): """ Flush the recent updates and last processed offset to the storage. @@ -305,9 +329,7 @@ def expire_windows(self, duration_ms: int, grace_ms: int = 0): ... -class WindowedPartitionTransaction(WindowedState): - @property - def state(self) -> WindowedState: ... +class WindowedPartitionTransaction(Protocol): @property def failed(self) -> bool: @@ -329,14 +351,67 @@ def completed(self) -> bool: """ ... - def with_prefix(self, prefix: Any = b"") -> Iterator[Self]: + def as_state(self, prefix: Any) -> WindowedState: ... + + def get_window( + self, + start_ms: int, + end_ms: int, + prefix: bytes, + default: Any = None, + ) -> Optional[Any]: """ - A context manager set the prefix for all keys in the scope. + Get the value of the window defined by `start` and `end` timestamps + if the window is present in the state, else default + + :param start_ms: start of the window in milliseconds + :param end_ms: end of the window in milliseconds + :param prefix: a key prefix + :param default: default value to return if the key is not found + :return: value or None if the key is not found and `default` is not provided + """ + ... + + def update_window( + self, start_ms: int, end_ms: int, value: Any, timestamp_ms: int, prefix: bytes + ): + """ + Set a value for the window. + + This method will also update the latest observed timestamp in state partition + using the provided `timestamp`. + + :param start_ms: start of the window in milliseconds + :param end_ms: end of the window in milliseconds + :param value: value of the window + :param timestamp_ms: current message timestamp in milliseconds + :param prefix: a key prefix + """ + ... + + def get_latest_timestamp(self) -> int: + """ + Get the latest observed timestamp for the current state partition. + + Use this timestamp to determine if the arriving event is late and should be + discarded from the processing. + + :return: latest observed event timestamp in milliseconds + """ + ... + + def expire_windows(self, duration_ms: int, prefix: bytes, grace_ms: int = 0): + """ + Get a list of expired windows from RocksDB considering the current + latest timestamp, window duration and grace period. - Normally, it's called by `StreamingDataFrame` internals to ensure that every - message key is stored separately. - :param prefix: key prefix - :return: context manager + It also marks the latest found window as expired in the expiration index, so + calling this method multiple times will yield different results for the same + "latest timestamp". 
+ + :param duration_ms: duration of the windows in milliseconds + :param prefix: a key prefix + :param grace_ms: grace period in milliseconds. Default - "0" """ ... diff --git a/tests/test_quixstreams/fixtures.py b/tests/test_quixstreams/fixtures.py index 636035115..30da3fc65 100644 --- a/tests/test_quixstreams/fixtures.py +++ b/tests/test_quixstreams/fixtures.py @@ -276,6 +276,7 @@ def app_factory(kafka_container, random_consumer_group, tmp_path): def factory( consumer_group: Optional[str] = None, auto_offset_reset: AutoOffsetReset = "latest", + commit_interval: float = 5.0, consumer_extra_config: Optional[dict] = None, producer_extra_config: Optional[dict] = None, on_consumer_error: Optional[ConsumerErrorCallback] = None, @@ -292,6 +293,7 @@ def factory( broker_address=kafka_container.broker_address, consumer_group=consumer_group or random_consumer_group, auto_offset_reset=auto_offset_reset, + commit_interval=commit_interval, consumer_extra_config=consumer_extra_config, producer_extra_config=producer_extra_config, on_consumer_error=on_consumer_error, diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py index 1d7c9e2e4..89f107dcc 100644 --- a/tests/test_quixstreams/test_app.py +++ b/tests/test_quixstreams/test_app.py @@ -18,6 +18,7 @@ JSONDeserializer, SerializationError, JSONSerializer, + TopicConfig, ) from quixstreams.platforms.quix import ( QuixKafkaConfigsBuilder, @@ -848,8 +849,7 @@ def count(_, state: State): store = state_manager.get_store(topic=topic_in.name, store_name="default") with store.start_partition_transaction(partition=partition_num) as tx: # All keys in state must be prefixed with the message key - with tx.with_prefix(message_key): - assert tx.get("total") == total_consumed.result() + assert tx.get("total", prefix=message_key) == total_consumed.result() def test_run_stateful_processing_fails( self, @@ -860,7 +860,6 @@ def test_run_stateful_processing_fails( ): consumer_group = str(uuid.uuid4()) state_dir = (tmp_path / "state").absolute() - partition_num = 0 app = app_factory( consumer_group=consumer_group, auto_offset_reset="earliest", @@ -885,14 +884,12 @@ def fail(*_): total_messages = 3 # Produce messages to the topic and flush - data = { - "key": b"key", - "value": dumps({"key": "value"}), - "partition": partition_num, - } + key = b"key" + value = dumps({"key": "value"}) + with app.get_producer() as producer: for _ in range(total_messages): - producer.produce(topic_in.name, **data) + producer.produce(topic_in.name, key=key, value=value) # Stop app when the future is resolved executor.submit(_stop_app_on_future, app, failed, 10.0) @@ -905,11 +902,11 @@ def fail(*_): ) state_manager.register_store(topic_in.name, "default") state_manager.on_partition_assign( - TopicPartitionStub(topic=topic_in.name, partition=partition_num) + TopicPartitionStub(topic=topic_in.name, partition=0) ) store = state_manager.get_store(topic=topic_in.name, store_name="default") - with store.start_partition_transaction(partition=partition_num) as tx: - assert tx.get("total") is None + with store.start_partition_transaction(partition=0) as tx: + assert tx.get("total", prefix=key) is None def test_run_stateful_suppress_processing_errors( self, @@ -973,8 +970,7 @@ def fail(_): ) store = state_manager.get_store(topic=topic_in.name, store_name="default") with store.start_partition_transaction(partition=partition_num) as tx: - with tx.with_prefix(message_key): - assert tx.get("total") == total_consumed.result() + assert tx.get("total", prefix=message_key) == 
total_consumed.result() def test_on_assign_topic_offset_behind_warning( self, @@ -1003,11 +999,11 @@ def test_on_assign_topic_offset_behind_warning( state_partitions = state_manager.on_partition_assign( TopicPartitionStub(topic=topic_in.name, partition=partition_num) ) - with state_manager.start_store_transaction( - topic=topic_in.name, partition=partition_num, offset=9999 - ): - tx = state_manager.get_store_transaction() - tx.set("key", "value") + store = state_manager.get_store(topic_in.name, "default") + tx = store.start_partition_transaction(partition_num) + # Do some change to probe the Writebatch + tx.set("key", "value", prefix=b"__key__") + tx.maybe_flush(offset=9999) assert state_partitions[partition_num].get_processed_offset() == 9999 # Define some stateful function so the App assigns store partitions @@ -1057,7 +1053,7 @@ def test_clear_state( ) topic_in_name, _ = topic_factory() - tx_prefix = b"key" + prefix = b"key" state_manager = state_manager_factory( group_id=consumer_group, state_dir=state_dir @@ -1072,8 +1068,7 @@ def test_clear_state( store = state_manager.get_store(topic=topic_in_name, store_name="default") with store.start_partition_transaction(partition=0) as tx: # All keys in state must be prefixed with the message key - with tx.with_prefix(tx_prefix): - tx.set("my_state", True) + tx.set(key="my_state", value=True, prefix=prefix) # Clear the state app.clear_state() @@ -1086,9 +1081,7 @@ def test_clear_state( ) store = state_manager.get_store(topic=topic_in_name, store_name="default") with store.start_partition_transaction(partition=0) as tx: - # All keys in state must be prefixed with the message key - with tx.with_prefix(tx_prefix): - assert tx.get("my_state") is None + assert tx.get("my_state", prefix=prefix) is None def test_app_use_changelog_false(self, app_factory): """ @@ -1099,7 +1092,7 @@ def test_app_use_changelog_false(self, app_factory): assert not app._state_manager.using_changelogs -class TestAppRecovery: +class TestApplicationRecovery: def test_changelog_recovery_default_store( self, app_factory, @@ -1130,14 +1123,18 @@ def sum_value(value: dict, state: State): def get_app(): app = app_factory( + commit_interval=0, # Commit every processed message auto_offset_reset="earliest", - use_changelog_topics="True", + use_changelog_topics=True, on_message_processed=on_message_processed, consumer_group=consumer_group, state_dir=state_dir, ) topic = app.topic( - topic_name, config=app._topic_manager.topic_config(num_partitions=2) + topic_name, + config=TopicConfig( + num_partitions=len(partition_msg_count), replication_factor=1 + ), ) sdf = app.dataframe(topic) sdf = sdf.apply(sum_value, stateful=True) @@ -1149,23 +1146,19 @@ def validate_state(): state_dir=state_dir, ) as state_manager: state_manager.register_store(topic.name, store_name) - for p_num in partition_msg_count: + for p_num, count in partition_msg_count.items(): state_manager.on_partition_assign( TopicPartitionStub(topic=topic.name, partition=p_num) ) - store = state_manager.get_store(topic=topic.name, store_name=store_name) - for p_num, count in partition_msg_count.items(): - assert store._partitions[p_num].get_changelog_offset() == count - with store.start_partition_transaction(partition=p_num) as tx: - # All keys in state must be prefixed with the message key - with tx.with_prefix(f"key{p_num}".encode()): - assert tx.get(sum_key) == count * msg_int_value - - for p_num in partition_msg_count: - state_manager.on_partition_revoke( - TopicPartitionStub(topic=topic.name, partition=p_num) + store = 
state_manager.get_store( + topic=topic.name, store_name=store_name ) - state_manager.clear_stores() + partition = store.partitions[p_num] + assert partition.get_changelog_offset() == count + with partition.begin() as tx: + # All keys in state must be prefixed with the message key + prefix = f"key{p_num}".encode() + assert tx.get(sum_key, prefix=prefix) == count * msg_int_value # Produce messages to the topic and flush app, sdf, topic = get_app() @@ -1174,25 +1167,26 @@ def validate_state(): serialized = topic.serialize( key=f"key{p_num}".encode(), value={"my_value": msg_int_value} ) - data = { - "key": serialized.key, - "value": serialized.value, - "partition": p_num, - } for _ in range(count): - producer.produce(topic.name, **data) + producer.produce( + topic.name, + key=serialized.key, + value=serialized.value, + partition=p_num, + ) - # run app to populate state + # run app to populate state with data done = Future() executor.submit(_stop_app_on_future, app, done, 10.0) app.run(sdf) # validate and then delete the state assert processed_count == partition_msg_count - processed_count = {0: 0, 1: 0} validate_state() # run the app again and validate the recovered state + processed_count = {0: 0, 1: 0} app, sdf, topic = get_app() + app.clear_state() done = Future() executor.submit(_stop_app_on_future, app, done, 10.0) app.run(sdf) @@ -1253,14 +1247,18 @@ def on_message_processed(topic_, partition, offset): def get_app(): app = app_factory( + commit_interval=0, # Commit every processed message auto_offset_reset="earliest", - use_changelog_topics="True", + use_changelog_topics=True, consumer_group=consumer_group, on_message_processed=on_message_processed, state_dir=state_dir, ) topic = app.topic( - topic_name, config=app._topic_manager.topic_config(num_partitions=2) + topic_name, + config=TopicConfig( + num_partitions=len(partition_msg_count), replication_factor=1 + ), ) sdf = app.dataframe(topic) sdf = sdf.apply(lambda row: row["my_value"]) @@ -1286,12 +1284,14 @@ def validate_state(): group_id=consumer_group, state_dir=state_dir ) as state_manager: state_manager.register_windowed_store(topic.name, store_name) - for p_num in partition_timestamps: + for p_num, windows in expected_window_updates.items(): state_manager.on_partition_assign( TopicPartitionStub(topic=topic.name, partition=p_num) ) - store = state_manager.get_store(topic=topic.name, store_name=store_name) - for p_num, windows in expected_window_updates.items(): + store = state_manager.get_store( + topic=topic.name, store_name=store_name + ) + # in this test, each expiration check only deletes one window, # simplifying the offset counting. 
expected_offset = sum( @@ -1299,25 +1299,21 @@ def validate_state(): ) + 2 * len(expected_expired_windows[p_num]) assert ( expected_offset - == store._partitions[p_num].get_changelog_offset() + == store.partitions[p_num].get_changelog_offset() ) - with store.start_partition_transaction(partition=p_num) as tx: - with tx.with_prefix(f"key{p_num}".encode()): - for window, count in windows.items(): - expected = count - if window in expected_expired_windows[p_num]: - expected = None - else: - # each message value was 10 - expected *= msg_int_value - assert tx.get_window(*window) == expected - - for p_num in partition_timestamps: - state_manager.on_partition_revoke( - TopicPartitionStub(topic=topic.name, partition=p_num) - ) - state_manager.clear_stores() + partition = store.partitions[p_num] + + with partition.begin() as tx: + prefix = f"key{p_num}".encode() + for window, count in windows.items(): + expected = count + if window in expected_expired_windows[p_num]: + expected = None + else: + # each message value was 10 + expected *= msg_int_value + assert tx.get_window(*window, prefix=prefix) == expected app, sdf, topic = get_app() # Produce messages to the topic and flush @@ -1341,11 +1337,12 @@ def validate_state(): app.run(sdf) # validate and then delete the state assert processed_count == partition_msg_count - processed_count = {0: 0, 1: 0} validate_state() # run the app again and validate the recovered state + processed_count = {0: 0, 1: 0} app, sdf, topic = get_app() + app.clear_state() done = Future() executor.submit(_stop_app_on_future, app, done, 10.0) app.run(sdf) diff --git a/tests/test_quixstreams/test_checkpoint.py b/tests/test_quixstreams/test_checkpoint.py new file mode 100644 index 000000000..6ee42251e --- /dev/null +++ b/tests/test_quixstreams/test_checkpoint.py @@ -0,0 +1,166 @@ +import contextlib +from unittest.mock import patch + +import pytest + +from quixstreams.state.exceptions import ( + StoreNotRegisteredError, + InvalidStoreTransactionStateError, +) +from quixstreams.state.rocksdb import RocksDBPartitionTransaction +from tests.utils import TopicPartitionStub + + +@pytest.mark.skip("Checkpoint tests") +class TestCheckpoint: + def test_get_store_transaction_store_not_registered_fails(self, state_manager): + with pytest.raises(StoreNotRegisteredError): + with state_manager.start_store_transaction("topic", 0, 0): + ... 
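For comparison with the `start_store_transaction` flow used by these skipped checkpoint tests, a rough sketch of driving the batched transaction directly, assuming `store` is an assigned `RocksDBStore` (the key and offset values are illustrative): updates are staged in the per-prefix update cache and only land in the RocksDB `WriteBatch` when the transaction is flushed.

    # Sketch only: `store` stands in for an assigned RocksDBStore.
    tx = store.start_partition_transaction(0)
    tx.set("some_key", "some_value", prefix=b"__key__")  # staged in the update cache
    assert tx.get("some_key", prefix=b"__key__") == "some_value"  # served from the cache
    tx.maybe_flush(offset=1)  # the WriteBatch is built and written here
    assert store.partitions[0].get_processed_offset() == 1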
+ + def test_get_store_transaction_not_started(self, state_manager): + with pytest.raises(InvalidStoreTransactionStateError): + state_manager.get_store_transaction("store") + + def test_store_transaction_success(self, state_manager): + state_manager.register_store("topic", "store") + tp = TopicPartitionStub("topic", 0) + state_manager.on_partition_assign(tp) + + store = state_manager.get_store("topic", "store") + store_partition = store.partitions[0] + + assert store_partition.get_processed_offset() is None + + with state_manager.start_store_transaction("topic", partition=0, offset=1): + tx = state_manager.get_store_transaction("store") + tx.set("some_key", "some_value", prefix=b"__key__") + + state_manager.on_partition_assign(tp) + + store = state_manager.get_store("topic", "store") + store_partition = store.partitions[0] + + assert store_partition.get_processed_offset() == 1 + + def test_store_transaction_no_flush_if_partition_transaction_failed( + self, state_manager + ): + """ + Ensure that no PartitionTransactions are flushed to the DB if + any of them fails + """ + state_manager.register_store("topic", "store1") + state_manager.register_store("topic", "store2") + state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) + store1 = state_manager.get_store("topic", "store1") + store2 = state_manager.get_store("topic", "store2") + + with state_manager.start_store_transaction("topic", partition=0, offset=1): + tx_store1 = state_manager.get_store_transaction("store1") + tx_store2 = state_manager.get_store_transaction("store2") + # Simulate exception in one of the transactions + with contextlib.suppress(ValueError), patch.object( + RocksDBPartitionTransaction, + "_serialize_key", + side_effect=ValueError("test"), + ): + tx_store1.set("some_key", "some_value") + tx_store2.set("some_key", "some_value") + + assert store1.partitions[0].get_processed_offset() is None + assert store2.partitions[0].get_processed_offset() is None + + def test_start_store_transaction_already_started(self, state_manager): + state_manager.register_store("topic", "store") + with state_manager.start_store_transaction("topic", partition=0, offset=0): + with pytest.raises(InvalidStoreTransactionStateError): + with state_manager.start_store_transaction( + "topic", partition=0, offset=0 + ): + ... 
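The failure path simulated in `test_store_transaction_no_flush_if_partition_transaction_failed` above can also be sketched against a single transaction, assuming the default JSON serializer (which rejects an `object()` key) and the `failed` flag declared in the `PartitionTransaction` protocol:

    # Sketch only: `store` stands in for an assigned RocksDBStore.
    tx = store.start_partition_transaction(0)
    try:
        tx.set(object(), "value", prefix=b"key")  # non-serializable key raises
    except Exception:
        pass
    # The transaction is marked as failed and its updates are never flushed
    assert tx.failed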
+ + def test_store_transaction_no_flush_on_exception(self, state_manager): + state_manager.register_store("topic", "store") + state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) + store = state_manager.get_store("topic", "store") + + with contextlib.suppress(Exception): + with state_manager.start_store_transaction("topic", partition=0, offset=1): + tx = state_manager.get_store_transaction("store") + tx.set("some_key", "some_value") + raise ValueError() + + store_partition = store.partitions[0] + assert store_partition.get_processed_offset() is None + + +@pytest.mark.skip("Checkpoint tests") +class TestCheckpointChangelog: + def test_store_transaction_no_flush_on_exception( + self, + state_manager_changelogs, + ): + state_manager = state_manager_changelogs + recovery_manager = state_manager._recovery_manager + topic_manager = recovery_manager._topic_manager + producer = state_manager._producer + consumer = recovery_manager._consumer + + consumer.get_watermark_offsets.return_value = (0, 10) + topic_manager.topic(name="topic") + state_manager.register_store("topic", store_name="store") + state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) + store = state_manager.get_store("topic", "store") + + with contextlib.suppress(Exception): + with state_manager.start_store_transaction("topic", partition=0, offset=1): + tx = state_manager.get_store_transaction("store") + tx.set("some_key", "some_value") + raise ValueError() + + store_partition = store.partitions[0] + assert store_partition.get_processed_offset() is None + assert store_partition.get_changelog_offset() is None + producer.produce.assert_not_called() + + def test_store_transaction_no_flush_if_partition_transaction_failed( + self, + state_manager_changelogs, + ): + """ + Ensure that no PartitionTransactions are flushed to the DB if + any of them fails + """ + state_manager = state_manager_changelogs + recovery_manager = state_manager._recovery_manager + topic_manager = recovery_manager._topic_manager + producer = state_manager._producer + consumer = recovery_manager._consumer + + consumer.get_watermark_offsets.return_value = (0, 10) + topic_manager.topic(name="topic") + state_manager.register_store("topic", store_name="store1") + state_manager.register_store("topic", store_name="store2") + state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) + + store1 = state_manager.get_store("topic", "store1") + store2 = state_manager.get_store("topic", "store2") + + with state_manager.start_store_transaction("topic", partition=0, offset=1): + tx_store1 = state_manager.get_store_transaction("store1") + tx_store2 = state_manager.get_store_transaction("store2") + # Simulate exception in one of the transactions + with contextlib.suppress(ValueError), patch.object( + RocksDBPartitionTransaction, + "_serialize_key", + side_effect=ValueError("test"), + ): + tx_store1.set("some_key", "some_value") + tx_store2.set("some_key", "some_value") + + assert store1.partitions[0].get_processed_offset() is None + assert store1.partitions[0].get_changelog_offset() is None + assert store2.partitions[0].get_processed_offset() is None + assert store2.partitions[0].get_changelog_offset() is None + producer.produce.assert_not_called() diff --git a/tests/test_quixstreams/test_dataframe/fixtures.py b/tests/test_quixstreams/test_dataframe/fixtures.py index ac61f0c83..3d8c7f706 100644 --- a/tests/test_quixstreams/test_dataframe/fixtures.py +++ b/tests/test_quixstreams/test_dataframe/fixtures.py @@ -5,6 +5,9 @@ from 
quixstreams.dataframe.dataframe import StreamingDataFrame from quixstreams.models.topics import Topic +from quixstreams.processing_context import ProcessingContext +from quixstreams.rowconsumer import RowConsumer +from quixstreams.rowproducer import RowProducer from quixstreams.state import StateStoreManager @@ -13,10 +16,20 @@ def dataframe_factory(topic_manager_topic_factory): def factory( topic: Optional[Topic] = None, state_manager: Optional[StateStoreManager] = None, + producer: Optional[RowProducer] = None, ) -> StreamingDataFrame: - return StreamingDataFrame( - topic=topic or topic_manager_topic_factory("test"), - state_manager=state_manager or MagicMock(spec=StateStoreManager), + producer = producer if producer is not None else MagicMock(spec_set=RowProducer) + state_manager = state_manager or MagicMock(spec=StateStoreManager) + topic = topic or topic_manager_topic_factory("test") + + processing_ctx = ProcessingContext( + producer=producer, + consumer=MagicMock(spec_set=RowConsumer), + commit_interval=0, + state_manager=state_manager, ) + processing_ctx.init_checkpoint() + + return StreamingDataFrame(topic=topic, processing_context=processing_ctx) return factory diff --git a/tests/test_quixstreams/test_dataframe/test_dataframe.py b/tests/test_quixstreams/test_dataframe/test_dataframe.py index a7174cedd..729cadd82 100644 --- a/tests/test_quixstreams/test_dataframe/test_dataframe.py +++ b/tests/test_quixstreams/test_dataframe/test_dataframe.py @@ -7,7 +7,7 @@ from quixstreams.core.stream import Filtered from quixstreams.dataframe.exceptions import InvalidOperation from quixstreams.dataframe.windows import WindowResult -from quixstreams.models import MessageTimestamp, Topic +from quixstreams.models import MessageTimestamp from tests.utils import TopicPartitionStub @@ -291,8 +291,7 @@ def test_to_topic( ) producer = row_producer_factory() - sdf = dataframe_factory() - sdf.producer = producer + sdf = dataframe_factory(producer=producer) sdf = sdf.to_topic(topic) value = {"x": 1, "y": 2} @@ -331,8 +330,7 @@ def test_to_topic_apply_expand( ) producer = row_producer_factory() - sdf = dataframe_factory() - sdf.producer = producer + sdf = dataframe_factory(producer=producer) sdf = sdf.apply(lambda v: [v, v], expand=True).to_topic(topic) @@ -377,8 +375,7 @@ def test_to_topic_custom_key( ) producer = row_producer_factory() - sdf = dataframe_factory() - sdf.producer = producer + sdf = dataframe_factory(producer=producer) # Use value["x"] as a new key sdf = sdf.to_topic(topic, key=lambda v: v["x"]) @@ -412,17 +409,14 @@ def test_to_topic_multiple_topics_out( topic_manager_topic_factory, ): topic_0 = topic_manager_topic_factory( - value_serializer="json", - value_deserializer="json", + value_serializer="json", value_deserializer="json" ) topic_1 = topic_manager_topic_factory( - value_serializer="json", - value_deserializer="json", + value_serializer="json", value_deserializer="json" ) producer = row_producer_factory() - sdf = dataframe_factory() - sdf.producer = producer + sdf = dataframe_factory(producer=producer) sdf = sdf.to_topic(topic_0).to_topic(topic_1) @@ -453,29 +447,6 @@ def test_to_topic_multiple_topics_out( assert consumed_row.key == ctx.key assert consumed_row.value == value - def test_to_topic_no_producer_assigned( - self, dataframe_factory, topic_manager_topic_factory - ): - topic = topic_manager_topic_factory() - - sdf = dataframe_factory() - sdf = sdf.to_topic(topic) - - value = {"x": "1", "y": "2"} - ctx = MessageContext( - key=b"test", - topic="test", - partition=0, - offset=0, 
- size=0, - timestamp=MessageTimestamp.create(0, 0), - ) - - with pytest.raises( - RuntimeError, match="Producer instance has not been provided" - ): - sdf.test(value, ctx=ctx) - class TestStreamingDataframeStateful: def test_apply_stateful( @@ -513,10 +484,7 @@ def stateful_func(value_: dict, state: State) -> int: timestamp=MessageTimestamp.create(0, 0), ) for value in values: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - result = sdf.test(value, ctx) + result = sdf.test(value, ctx) assert result == 10 @@ -555,10 +523,7 @@ def stateful_func(value_: dict, state: State): timestamp=MessageTimestamp.create(0, 0), ) for value in values: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - result = sdf.test(value, ctx) + result = sdf.test(value, ctx) assert result is not None assert result["max"] == 10 @@ -599,13 +564,10 @@ def stateful_func(value_: dict, state: State): ) results = [] for value in values: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - try: - results.append(sdf.test(value, ctx)) - except Filtered: - pass + try: + results.append(sdf.test(value, ctx)) + except Filtered: + pass assert len(results) == 1 assert results[0]["max"] == 3 @@ -645,13 +607,10 @@ def stateful_func(value_: dict, state: State): ) results = [] for value in values: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - try: - results.append(sdf.test(value, ctx)) - except Filtered: - pass + try: + results.append(sdf.test(value, ctx)) + except Filtered: + pass assert len(results) == 1 assert results[0]["max"] == 3 @@ -724,10 +683,7 @@ def test_tumbling_window_current( results = [] for value, ctx in messages: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - results += sdf.test(value=value, ctx=ctx) + results += sdf.test(value=value, ctx=ctx) assert len(results) == 3 assert results == [ WindowResult(value=1, start=0, end=10000), @@ -768,11 +724,8 @@ def test_tumbling_window_current_out_of_order_late( results = [] for value, ctx in messages: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - result = sdf.test(value=value, ctx=ctx) - results += result + result = sdf.test(value=value, ctx=ctx) + results += result assert len(results) == 2 assert results == [ @@ -812,11 +765,7 @@ def test_tumbling_window_final( results = [] for value, ctx in messages: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - result = sdf.test(value=value, ctx=ctx) - results += result + results += sdf.test(value=value, ctx=ctx) assert len(results) == 2 assert results == [ @@ -850,10 +799,7 @@ def test_tumbling_window_none_key_messages( results = [] for value, ctx in messages: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - results += sdf.test(value=value, ctx=ctx) + results += sdf.test(value=value, ctx=ctx) assert len(results) == 2 # Ensure that the windows are returned with correct values and order @@ -961,10 +907,7 @@ def test_hopping_window_current( results = [] for value, ctx in messages: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - results += 
sdf.test(value=value, ctx=ctx) + results += sdf.test(value=value, ctx=ctx) assert len(results) == 9 # Ensure that the windows are returned with correct values and order @@ -1008,10 +951,7 @@ def test_hopping_window_current_out_of_order_late( results = [] for value, ctx in messages: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - results += sdf.test(value=value, ctx=ctx) + results += sdf.test(value=value, ctx=ctx) assert len(results) == 5 # Ensure that the windows are returned with correct values and order @@ -1053,11 +993,9 @@ def test_hopping_window_final( ] results = [] + for value, ctx in messages: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - results += sdf.test(value=value, ctx=ctx) + results += sdf.test(value=value, ctx=ctx) assert len(results) == 3 # Ensure that the windows are returned with correct values and order @@ -1093,10 +1031,7 @@ def test_hopping_window_none_key_messages( results = [] for value, ctx in messages: - with state_manager.start_store_transaction( - topic=ctx.topic, partition=ctx.partition, offset=ctx.offset - ): - results += sdf.test(value=value, ctx=ctx) + results += sdf.test(value=value, ctx=ctx) assert len(results) == 2 # Ensure that the windows are returned with correct values and order diff --git a/tests/test_quixstreams/test_dataframe/test_windows/test_hopping.py b/tests/test_quixstreams/test_dataframe/test_windows/test_hopping.py index 93d5accf2..b72ab6fba 100644 --- a/tests/test_quixstreams/test_dataframe/test_windows/test_hopping.py +++ b/tests/test_quixstreams/test_dataframe/test_windows/test_hopping.py @@ -55,11 +55,11 @@ def test_hoppingwindow_count( store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - window.process_window(value=2, state=tx.state, timestamp_ms=100) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + window.process_window(value=2, state=state, timestamp_ms=100) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 2 assert updated[0]["value"] == 2 assert updated[0]["start"] == 95 @@ -77,11 +77,11 @@ def test_hoppingwindow_sum(self, hopping_window_definition_factory, state_manage store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - window.process_window(value=2, state=tx.state, timestamp_ms=100) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + window.process_window(value=2, state=state, timestamp_ms=100) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 2 assert updated[0]["value"] == 3 assert updated[0]["start"] == 95 @@ -99,11 +99,11 @@ def test_hoppingwindow_mean(self, hopping_window_definition_factory, state_manag store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - window.process_window(value=2, state=tx.state, timestamp_ms=100) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = 
tx.as_state(prefix=b"key") + window.process_window(value=2, state=state, timestamp_ms=100) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 2 assert updated[0]["value"] == 1.5 assert updated[0]["start"] == 95 @@ -126,10 +126,10 @@ def test_hoppingwindow_reduce( store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 2 assert updated[0]["value"] == [1] assert updated[0]["start"] == 95 @@ -147,10 +147,10 @@ def test_hoppingwindow_max(self, hopping_window_definition_factory, state_manage store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 2 assert updated[0]["value"] == 1 assert updated[0]["start"] == 95 @@ -168,10 +168,10 @@ def test_hoppingwindow_min(self, hopping_window_definition_factory, state_manage store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 2 assert updated[0]["value"] == 1 assert updated[0]["start"] == 95 @@ -218,31 +218,29 @@ def test_hopping_window_process_window_expired( store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - # Add item to the windows [95, 105) and [100, 110) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) - assert len(updated) == 2 - assert updated[0]["value"] == 1 - assert updated[0]["start"] == 95 - assert updated[0]["end"] == 105 - assert updated[1]["value"] == 1 - assert updated[1]["start"] == 100 - assert updated[1]["end"] == 110 - - assert not expired - - # Now add item to the windows [105, 115) and [110, 120) - # The windows [95, 105) and [100, 110) are now expired - # and should be returned - _, expired = window.process_window( - value=2, state=tx.state, timestamp_ms=110 - ) - assert len(expired) == 2 - assert expired[0]["value"] == 1 - assert expired[0]["start"] == 95 - assert expired[0]["end"] == 105 - assert expired[1]["value"] == 1 - assert expired[1]["start"] == 100 - assert expired[1]["end"] == 110 + state = tx.as_state(prefix=b"key") + # Add item to the windows [95, 105) and [100, 110) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) + assert len(updated) == 2 + assert updated[0]["value"] == 1 + assert updated[0]["start"] == 95 + assert updated[0]["end"] == 105 + assert updated[1]["value"] == 1 + assert updated[1]["start"] == 100 + assert updated[1]["end"] == 110 + + assert not 
expired + + # Now add item to the windows [105, 115) and [110, 120) + # The windows [95, 105) and [100, 110) are now expired + # and should be returned + _, expired = window.process_window(value=2, state=state, timestamp_ms=110) + assert len(expired) == 2 + assert expired[0]["value"] == 1 + assert expired[0]["start"] == 95 + assert expired[0]["end"] == 105 + assert expired[1]["value"] == 1 + assert expired[1]["start"] == 100 + assert expired[1]["end"] == 110 diff --git a/tests/test_quixstreams/test_dataframe/test_windows/test_tumbling.py b/tests/test_quixstreams/test_dataframe/test_windows/test_tumbling.py index e255b55cf..3216d9d06 100644 --- a/tests/test_quixstreams/test_dataframe/test_windows/test_tumbling.py +++ b/tests/test_quixstreams/test_dataframe/test_windows/test_tumbling.py @@ -51,11 +51,11 @@ def test_tumblingwindow_count( store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - window.process_window(value=0, state=tx.state, timestamp_ms=100) - updated, expired = window.process_window( - value=0, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + window.process_window(value=0, state=state, timestamp_ms=100) + updated, expired = window.process_window( + value=0, state=state, timestamp_ms=100 + ) assert len(updated) == 1 assert updated[0]["value"] == 2 assert not expired @@ -69,11 +69,11 @@ def test_tumblingwindow_sum( store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - window.process_window(value=2, state=tx.state, timestamp_ms=100) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + window.process_window(value=2, state=state, timestamp_ms=100) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 1 assert updated[0]["value"] == 3 assert not expired @@ -87,11 +87,11 @@ def test_tumblingwindow_mean( store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - window.process_window(value=2, state=tx.state, timestamp_ms=100) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + window.process_window(value=2, state=state, timestamp_ms=100) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 1 assert updated[0]["value"] == 1.5 assert not expired @@ -108,11 +108,11 @@ def test_tumblingwindow_reduce( store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - window.process_window(value=2, state=tx.state, timestamp_ms=100) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + window.process_window(value=2, state=state, timestamp_ms=100) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 1 assert updated[0]["value"] == [2, 1] assert not expired @@ -126,11 +126,11 @@ def test_tumblingwindow_max( store = state_manager.get_store(topic="test", 
store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - window.process_window(value=2, state=tx.state, timestamp_ms=100) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + window.process_window(value=2, state=state, timestamp_ms=100) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 1 assert updated[0]["value"] == 2 assert not expired @@ -144,11 +144,11 @@ def test_tumblingwindow_min( store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - window.process_window(value=2, state=tx.state, timestamp_ms=100) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) + state = tx.as_state(prefix=b"key") + window.process_window(value=2, state=state, timestamp_ms=100) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) assert len(updated) == 1 assert updated[0]["value"] == 1 assert not expired @@ -182,28 +182,28 @@ def test_tumbling_window_process_window_expired( store = state_manager.get_store(topic="test", store_name=window.name) store.assign_partition(0) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(prefix=b"key"): - # Add item to the window [100, 110) - updated, expired = window.process_window( - value=1, state=tx.state, timestamp_ms=100 - ) - assert len(updated) == 1 - assert updated[0]["value"] == 1 - assert updated[0]["start"] == 100 - assert updated[0]["end"] == 110 - assert not expired - - # Now add item to the window [110, 120) - # The window [100, 110) is now expired and should be returned - updated, expired = window.process_window( - value=2, state=tx.state, timestamp_ms=110 - ) - assert len(updated) == 1 - assert updated[0]["value"] == 2 - assert updated[0]["start"] == 110 - assert updated[0]["end"] == 120 - - assert len(expired) == 1 - assert expired[0]["value"] == 1 - assert expired[0]["start"] == 100 - assert expired[0]["end"] == 110 + state = tx.as_state(prefix=b"key") + # Add item to the window [100, 110) + updated, expired = window.process_window( + value=1, state=state, timestamp_ms=100 + ) + assert len(updated) == 1 + assert updated[0]["value"] == 1 + assert updated[0]["start"] == 100 + assert updated[0]["end"] == 110 + assert not expired + + # Now add item to the window [110, 120) + # The window [100, 110) is now expired and should be returned + updated, expired = window.process_window( + value=2, state=state, timestamp_ms=110 + ) + assert len(updated) == 1 + assert updated[0]["value"] == 2 + assert updated[0]["start"] == 110 + assert updated[0]["end"] == 120 + + assert len(expired) == 1 + assert expired[0]["value"] == 1 + assert expired[0]["start"] == 100 + assert expired[0]["end"] == 110 diff --git a/tests/test_quixstreams/test_state/test_manager.py b/tests/test_quixstreams/test_state/test_manager.py index 67982780d..1fd19b560 100644 --- a/tests/test_quixstreams/test_state/test_manager.py +++ b/tests/test_quixstreams/test_state/test_manager.py @@ -1,19 +1,15 @@ -import contextlib import os import uuid from unittest.mock import patch, call import pytest -import rocksdict from quixstreams.state.exceptions import ( StoreNotRegisteredError, - InvalidStoreTransactionStateError, PartitionStoreIsUsed, WindowedStoreAlreadyRegisteredError, ) 
from quixstreams.state.recovery import ChangelogProducerFactory -from quixstreams.state.rocksdb import RocksDBPartitionTransaction from tests.utils import TopicPartitionStub @@ -177,87 +173,6 @@ def test_clear_stores_fails(self, state_manager): with pytest.raises(PartitionStoreIsUsed): state_manager.clear_stores() - def test_store_transaction_success(self, state_manager): - state_manager.register_store("topic", "store") - tp = TopicPartitionStub("topic", 0) - state_manager.on_partition_assign(tp) - - store = state_manager.get_store("topic", "store") - store_partition = store.partitions[0] - - assert store_partition.get_processed_offset() is None - - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx = state_manager.get_store_transaction("store") - tx.set("some_key", "some_value") - - state_manager.on_partition_assign(tp) - - store = state_manager.get_store("topic", "store") - store_partition = store.partitions[0] - - assert store_partition.get_processed_offset() == 1 - - def test_store_transaction_no_flush_on_exception(self, state_manager): - state_manager.register_store("topic", "store") - state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) - store = state_manager.get_store("topic", "store") - - with contextlib.suppress(Exception): - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx = state_manager.get_store_transaction("store") - tx.set("some_key", "some_value") - raise ValueError() - - store_partition = store.partitions[0] - assert store_partition.get_processed_offset() is None - - def test_store_transaction_no_flush_if_partition_transaction_failed( - self, state_manager - ): - """ - Ensure that no PartitionTransactions are flushed to the DB if - any of them fails - """ - state_manager.register_store("topic", "store1") - state_manager.register_store("topic", "store2") - state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) - store1 = state_manager.get_store("topic", "store1") - store2 = state_manager.get_store("topic", "store2") - - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx_store1 = state_manager.get_store_transaction("store1") - tx_store2 = state_manager.get_store_transaction("store2") - # Simulate exception in one of the transactions - with contextlib.suppress(ValueError), patch.object( - RocksDBPartitionTransaction, - "_serialize_key", - side_effect=ValueError("test"), - ): - tx_store1.set("some_key", "some_value") - tx_store2.set("some_key", "some_value") - - assert store1.partitions[0].get_processed_offset() is None - assert store2.partitions[0].get_processed_offset() is None - - def test_get_store_transaction_store_not_registered_fails(self, state_manager): - with pytest.raises(StoreNotRegisteredError): - with state_manager.start_store_transaction("topic", 0, 0): - ... - - def test_get_store_transaction_not_started(self, state_manager): - with pytest.raises(InvalidStoreTransactionStateError): - state_manager.get_store_transaction("store") - - def test_start_store_transaction_already_started(self, state_manager): - state_manager.register_store("topic", "store") - with state_manager.start_store_transaction("topic", partition=0, offset=0): - with pytest.raises(InvalidStoreTransactionStateError): - with state_manager.start_store_transaction( - "topic", partition=0, offset=0 - ): - ... 
- class TestStateStoreManagerChangelog: def test_rebalance_partitions_stores_not_registered(self, state_manager_changelogs): @@ -334,71 +249,3 @@ def test_assign_revoke_partitions_stores_registered( for store in stores_list: assert not store.partitions - - def test_store_transaction_no_flush_on_exception( - self, - state_manager_changelogs, - ): - state_manager = state_manager_changelogs - recovery_manager = state_manager._recovery_manager - topic_manager = recovery_manager._topic_manager - producer = state_manager._producer - consumer = recovery_manager._consumer - - consumer.get_watermark_offsets.return_value = (0, 10) - topic_manager.topic(name="topic") - state_manager.register_store("topic", store_name="store") - state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) - store = state_manager.get_store("topic", "store") - - with contextlib.suppress(Exception): - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx = state_manager.get_store_transaction("store") - tx.set("some_key", "some_value") - raise ValueError() - - store_partition = store.partitions[0] - assert store_partition.get_processed_offset() is None - assert store_partition.get_changelog_offset() is None - producer.produce.assert_not_called() - - def test_store_transaction_no_flush_if_partition_transaction_failed( - self, - state_manager_changelogs, - ): - """ - Ensure that no PartitionTransactions are flushed to the DB if - any of them fails - """ - state_manager = state_manager_changelogs - recovery_manager = state_manager._recovery_manager - topic_manager = recovery_manager._topic_manager - producer = state_manager._producer - consumer = recovery_manager._consumer - - consumer.get_watermark_offsets.return_value = (0, 10) - topic_manager.topic(name="topic") - state_manager.register_store("topic", store_name="store1") - state_manager.register_store("topic", store_name="store2") - state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) - - store1 = state_manager.get_store("topic", "store1") - store2 = state_manager.get_store("topic", "store2") - - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx_store1 = state_manager.get_store_transaction("store1") - tx_store2 = state_manager.get_store_transaction("store2") - # Simulate exception in one of the transactions - with contextlib.suppress(ValueError), patch.object( - RocksDBPartitionTransaction, - "_serialize_key", - side_effect=ValueError("test"), - ): - tx_store1.set("some_key", "some_value") - tx_store2.set("some_key", "some_value") - - assert store1.partitions[0].get_processed_offset() is None - assert store1.partitions[0].get_changelog_offset() is None - assert store2.partitions[0].get_processed_offset() is None - assert store2.partitions[0].get_changelog_offset() is None - producer.produce.assert_not_called() diff --git a/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py b/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py index 44a207dd8..35f4e464f 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py @@ -1,41 +1,13 @@ import uuid from typing import Optional -from unittest.mock import create_autospec +from unittest.mock import create_autospec, MagicMock import pytest from quixstreams.state import ChangelogProducer, ChangelogProducerFactory from quixstreams.state.rocksdb import RocksDBStore from quixstreams.state.rocksdb.options import RocksDBOptions -from quixstreams.state.rocksdb.partition 
import ( - RocksDBStorePartition, -) - - -TEST_KEYS = [ - "string", - 123, - 123.123, - (123, 456), -] - -TEST_VALUES = [ - None, - "string", - 123, - 123.123, - {"key": "value", "mapping": {"key": "value"}}, - [123, 456], -] - -TEST_PREFIXES = [ - b"some_bytes", - "string", - 123, - 123.123, - (123, 456), - [123, 456], -] +from quixstreams.state.rocksdb.partition import RocksDBStorePartition @pytest.fixture() @@ -90,3 +62,8 @@ def rocksdb_store(rocksdb_store_factory) -> RocksDBStore: store = rocksdb_store_factory() yield store store.close() + + +@pytest.fixture() +def changelog_producer_mock(): + return MagicMock(spec_set=ChangelogProducer) diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py index a1f718343..40fefbb69 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py @@ -177,8 +177,7 @@ def test_recover_from_changelog_message(self, rocksdb_partition, store_value): rocksdb_partition.recover_from_changelog_message(changelog_msg) with rocksdb_partition.begin() as tx: - with tx.with_prefix(kafka_key): - assert tx.get(user_store_key) == store_value + assert tx.get(user_store_key, prefix=kafka_key) == store_value assert rocksdb_partition.get_changelog_offset() == changelog_msg.offset() + 1 @pytest.mark.parametrize( diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_store.py b/tests/test_quixstreams/test_state/test_rocksdb/test_store.py index f82030f3c..adda48011 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_store.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_store.py @@ -26,15 +26,16 @@ def test_revoke_partition_not_assigned(self, rocksdb_store): rocksdb_store.revoke_partition(0) def test_create_transaction(self, rocksdb_store): + prefix = b"__key__" rocksdb_store.assign_partition(0) with rocksdb_store.start_partition_transaction(0) as tx: - tx.set("key", "value") + tx.set("key", "value", prefix=prefix) rocksdb_store.revoke_partition(0) # Assign partition again and check the value rocksdb_store.assign_partition(0) with rocksdb_store.start_partition_transaction(0) as tx: - assert tx.get("key") == "value" + assert tx.get("key", prefix=prefix) == "value" assert rocksdb_store._changelog_producer_factory is None def test_get_transaction_partition_not_assigned(self, rocksdb_store): diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py index 2a87cbae9..b41d3f6af 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py @@ -1,7 +1,7 @@ import contextlib import secrets from datetime import datetime -from unittest.mock import patch, call +from unittest.mock import patch import pytest import rocksdict @@ -18,7 +18,31 @@ ) from quixstreams.state.rocksdb.serialization import serialize from quixstreams.utils.json import dumps -from .fixtures import TEST_KEYS, TEST_VALUES, TEST_PREFIXES + +TEST_KEYS = [ + "string", + 123, + 123.123, + (123, 456), +] + +TEST_VALUES = [ + None, + "string", + 123, + 123.123, + {"key": "value", "mapping": {"key": "value"}}, + [123, 456], +] + +TEST_PREFIXES = [ + b"some_bytes", + "string", + 123, + 123.123, + (123, 456), + [123, 456], +] class TestRocksDBPartitionTransaction: @@ -28,136 +52,33 @@ def test_transaction_complete(self, rocksdb_partition): assert 
tx.completed - def test_transaction_with_changelog(self, rocksdb_partition): - changelog_producer = rocksdb_partition._changelog_producer - key_out = "my_key" - value_out = "my_value" - cf = "default" - db_writes = 3 - assert rocksdb_partition.get_changelog_offset() is None - - with rocksdb_partition.begin() as tx: - for i in range(db_writes): - tx.set(key=f"{key_out}{i}", value=f"{value_out}{i}", cf_name=cf) - - changelog_producer.produce.assert_has_calls( - [ - call( - key=tx._serialize_key(key=f"{key_out}{i}"), - value=tx._serialize_value(value=f"{value_out}{i}"), - headers={CHANGELOG_CF_MESSAGE_HEADER: cf}, - ) - for i in range(db_writes) - ] - ) - assert changelog_producer.produce.call_count == db_writes - assert tx.completed - assert rocksdb_partition.get_changelog_offset() == db_writes - - def test_transaction_with_changelog_delete(self, rocksdb_partition): - changelog_producer = rocksdb_partition._changelog_producer - key_out = "my_key" - value_out = "my_value" - cf = "default" - assert rocksdb_partition.get_changelog_offset() is None - - with rocksdb_partition.begin() as tx: - tx.set(key=key_out, value=value_out, cf_name=cf) - - with rocksdb_partition.begin() as tx: - tx.delete(key=key_out, cf_name=cf) - - changelog_producer.produce.assert_has_calls( - [ - call( - key=tx._serialize_key(key=key_out), - value=tx._serialize_value(value=value_out), - headers={CHANGELOG_CF_MESSAGE_HEADER: cf}, - ), - call( - key=tx._serialize_key(key=key_out), - value=None, - headers={CHANGELOG_CF_MESSAGE_HEADER: cf}, - ), - ] - ) - assert changelog_producer.produce.call_count == 2 - assert tx.completed - assert rocksdb_partition.get_changelog_offset() == 2 - - def test_transaction_with_changelog_delete_cached(self, rocksdb_partition): - changelog_producer = rocksdb_partition._changelog_producer - key_out = "my_key" - value_out = "my_value" - cf = "default" - db_writes = 3 - delete_index = 2 - assert rocksdb_partition.get_changelog_offset() is None - - with rocksdb_partition.begin() as tx: - for i in range(db_writes): - tx.set(key=f"{key_out}{i}", value=f"{value_out}{i}", cf_name=cf) - tx.delete(key=f"{key_out}{delete_index}", cf_name=cf) - - changelog_producer.produce.assert_has_calls( - [ - call( - key=tx._serialize_key(key=f"{key_out}{i}"), - value=tx._serialize_value(value=f"{value_out}{i}"), - headers={CHANGELOG_CF_MESSAGE_HEADER: cf}, - ) - for i in range(db_writes - 1) - ] - + [ - call( - key=tx._serialize_key(key=f"{key_out}{delete_index}"), - value=None, - headers={CHANGELOG_CF_MESSAGE_HEADER: cf}, - ) - ] - ) - assert changelog_producer.produce.call_count == db_writes - assert tx.completed - assert rocksdb_partition.get_changelog_offset() == db_writes - - def test_transaction_with_changelog_delete_nonexisting_key(self, rocksdb_partition): - changelog_producer = rocksdb_partition._changelog_producer - key_out = "my_key" - cf = "default" - assert rocksdb_partition.get_changelog_offset() is None - - with rocksdb_partition.begin() as tx: - tx.delete(key=key_out, cf_name=cf) - - changelog_producer.produce.assert_called_with( - key=tx._serialize_key(key=key_out), - value=None, - headers={CHANGELOG_CF_MESSAGE_HEADER: cf}, - ) - - assert tx.completed - assert rocksdb_partition.get_changelog_offset() == 1 - - def test_transaction_doesnt_write_empty_batch(self, rocksdb_partition): + def test_transaction_doesnt_write_empty_batch( + self, changelog_producer_mock, rocksdb_partition_factory + ): """ Test that transaction doesn't call "StateStore.write()" if the internal WriteBatch is empty (i.e. 
no keys were updated during the transaction). Writing empty batches costs more than doing """ - changelog_producer = rocksdb_partition._changelog_producer - with patch.object(RocksDBStorePartition, "write") as mocked: - with rocksdb_partition.begin() as tx: - tx.get("key") - with rocksdb_partition.begin() as tx: - tx.get("key") + prefix = b"__key__" + with rocksdb_partition_factory( + changelog_producer=changelog_producer_mock + ) as partition: + with patch.object(RocksDBStorePartition, "write") as mocked: + with partition.begin() as tx: + tx.get("key", prefix=prefix) + + with partition.begin() as tx: + tx.get("key", prefix=prefix) assert not mocked.called - assert not changelog_producer.produce.called + assert not changelog_producer_mock.produce.called def test_delete_key_doesnt_exist(self, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: - tx.delete("key") + tx.delete("key", prefix=prefix) @pytest.mark.parametrize( "key", @@ -168,9 +89,10 @@ def test_delete_key_doesnt_exist(self, rocksdb_partition): TEST_VALUES, ) def test_get_key_exists_cached(self, key, value, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: - tx.set(key, value) - stored = tx.get(key) + tx.set(key, value, prefix=prefix) + stored = tx.get(key, prefix=prefix) assert stored == value @pytest.mark.parametrize( @@ -182,65 +104,76 @@ def test_get_key_exists_cached(self, key, value, rocksdb_partition): TEST_VALUES, ) def test_get_key_exists_no_cache(self, key, value, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: - tx.set(key, value) + tx.set(key, value, prefix=prefix) + with rocksdb_partition.begin() as tx: - stored = tx.get(key, value) + stored = tx.get(key, prefix=prefix) assert stored == value def test_get_key_doesnt_exist_default(self, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: - value = tx.get("key", default=123) + value = tx.get("key", default=123, prefix=prefix) assert value == 123 def test_delete_key_cached_no_flush(self, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: - tx.set("key", "value") - assert tx.get("key") == "value" - tx.delete("key") - assert tx.get("key") is None + tx.set("key", "value", prefix=prefix) + assert tx.get("key", prefix=prefix) == "value" + tx.delete("key", prefix=prefix) + assert tx.get("key", prefix=prefix) is None def test_delete_key_cached(self, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: - tx.set("key", "value") + tx.set("key", "value", prefix=prefix) with rocksdb_partition.begin() as tx: - assert tx.get("key") == "value" - tx.delete("key") - assert tx.get("key") is None + assert tx.get("key", prefix=prefix) == "value" + tx.delete("key", prefix=prefix) + assert tx.get("key", prefix=prefix) is None def test_delete_key_no_cache(self, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: - tx.set("key", "value") - assert tx.get("key") == "value" + tx.set("key", "value", prefix=prefix) + assert tx.get("key", prefix=prefix) == "value" with rocksdb_partition.begin() as tx: - tx.delete("key") + tx.delete("key", prefix=prefix) with rocksdb_partition.begin() as tx: - assert tx.get("key") is None + assert tx.get("key", prefix=prefix) is None def test_key_exists_cached(self, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: - tx.set("key", "value") - assert tx.exists("key") - assert not tx.exists("key123") + tx.set("key", "value", prefix=prefix) + 
assert tx.exists("key", prefix=prefix) + assert not tx.exists("key123", prefix=prefix) def test_key_exists_no_cache(self, rocksdb_partition): + prefix = b"__key__" + with rocksdb_partition.begin() as tx: - tx.set("key", "value") + tx.set("key", "value", prefix=prefix) + with rocksdb_partition.begin() as tx: - assert tx.exists("key") - assert not tx.exists("key123") + assert tx.exists("key", prefix=prefix) + assert not tx.exists("key123", prefix=prefix) def test_key_exists_deleted_in_cache(self, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: - tx.set("key", "value") + tx.set("key", "value", prefix=prefix) with rocksdb_partition.begin() as tx: - assert tx.exists("key") - tx.delete("key") - assert not tx.exists("key") + assert tx.exists("key", prefix=prefix) + tx.delete("key", prefix=prefix) + assert not tx.exists("key", prefix=prefix) @pytest.mark.parametrize( "key, value", @@ -252,15 +185,17 @@ def test_key_exists_deleted_in_cache(self, rocksdb_partition): ], ) def test_set_serialization_error(self, key, value, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: with pytest.raises(StateSerializationError): - tx.set(key, value) + tx.set(key, value, prefix=prefix) @pytest.mark.parametrize("key", [object(), b"somebytes", datetime.utcnow()]) def test_delete_serialization_error(self, key, rocksdb_partition): + prefix = b"__key__" with rocksdb_partition.begin() as tx: with pytest.raises(StateSerializationError): - tx.delete(key) + tx.delete(key, prefix=prefix) def test_get_deserialization_error(self, rocksdb_partition): bytes_ = secrets.token_bytes(10) @@ -275,46 +210,33 @@ def test_get_deserialization_error(self, rocksdb_partition): with rocksdb_partition.begin() as tx: with pytest.raises(StateSerializationError): - tx.get(string_) + tx.get(string_, prefix=b"") with pytest.raises(StateSerializationError): - tx.get(bytes_) - - @pytest.mark.parametrize("prefix", TEST_PREFIXES) - def test_set_key_with_prefix_no_cache(self, prefix, rocksdb_partition): - with rocksdb_partition.begin() as tx: - with tx.with_prefix(prefix): - tx.set("key", "value") - - with rocksdb_partition.begin() as tx: - with tx.with_prefix(prefix): - assert tx.get("key") == "value" - - with rocksdb_partition.begin() as tx: - assert tx.get("key") is None - - @pytest.mark.parametrize("prefix", TEST_PREFIXES) - def test_delete_key_with_prefix_no_cache(self, prefix, rocksdb_partition): - with rocksdb_partition.begin() as tx: - with tx.with_prefix(prefix): - tx.set("key", "value") - - with rocksdb_partition.begin() as tx: - with tx.with_prefix(prefix): - assert tx.get("key") == "value" + tx.get(bytes_, prefix=b"") + def test_set_key_different_prefixes(self, rocksdb_partition): + prefix1, prefix2 = b"__key1__", b"__key2__" with rocksdb_partition.begin() as tx: - with tx.with_prefix(prefix): - tx.delete("key") + tx.set("key", "value", prefix=prefix1) + assert tx.get("key", prefix=prefix1) == "value" + assert tx.get("key", prefix=prefix2) is None + def test_delete_key_different_prefixes_no_cache(self, rocksdb_partition): + prefix1, prefix2 = b"__key1__", b"__key2__" with rocksdb_partition.begin() as tx: - with tx.with_prefix(prefix): - assert tx.get("key") is None + tx.set("key", "value", prefix=prefix1) + tx.set("key", "value", prefix=prefix2) + assert tx.get("key", prefix=prefix1) == "value" + assert tx.get("key", prefix=prefix2) == "value" + tx.delete("key", prefix=prefix1) + assert tx.get("key", prefix=prefix1) is None + assert tx.get("key", prefix=prefix2) is not None 
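
Note: the tests above pin down the behaviour of the new explicit `prefix` argument: instead of wrapping calls in `with_prefix()`, each `get`/`set`/`delete`/`exists` call is scoped by the prefix it receives, so data written under one prefix (typically the Kafka message key) is invisible under another. A minimal sketch of the pattern, assuming `partition` is a `RocksDBStorePartition` such as the one provided by the `rocksdb_partition` fixture, with illustrative prefix values:

prefix_a = b"message-key-a"  # illustrative prefixes, typically Kafka message keys
prefix_b = b"message-key-b"

with partition.begin() as tx:
    tx.set("count", 10, prefix=prefix_a)
    # The same state key under a different prefix is a different record
    assert tx.get("count", prefix=prefix_a) == 10
    assert tx.get("count", prefix=prefix_b, default=0) == 0
    assert not tx.exists("count", prefix=prefix_b)
    # Deleting under one prefix leaves the other prefix untouched
    tx.delete("count", prefix=prefix_a)
    assert tx.get("count", prefix=prefix_a) is None
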
@pytest.mark.parametrize( "operation", [ - lambda tx: tx.set("key", "value"), - lambda tx: tx.delete("key"), + lambda tx, prefix: tx.set("key", "value", prefix=prefix), + lambda tx, prefix: tx.delete("key", prefix=prefix), ], ) def test_update_key_failed_transaction_failed(self, operation, rocksdb_partition): @@ -322,9 +244,8 @@ def test_update_key_failed_transaction_failed(self, operation, rocksdb_partition Test that if the update operation (set or delete) fails the transaction is marked as failed and cannot be re-used anymore. """ - # TODO: Test fails because writebatch is not used anymore during updates - # TODO: What's the point of this "failing?" - To not flush anything if one of transactions is incomplete on __exit__ - # Since now each update translates + + prefix = b"__key__" with patch.object( RocksDBPartitionTransaction, "_serialize_key", @@ -332,22 +253,22 @@ def test_update_key_failed_transaction_failed(self, operation, rocksdb_partition ): with rocksdb_partition.begin() as tx: with contextlib.suppress(ValueError): - operation(tx=tx) + operation(tx=tx, prefix=prefix) assert tx.failed # Ensure that Transaction cannot be used after it's failed with pytest.raises(StateTransactionError): - tx.set("key", "value") + tx.set("key", "value", prefix=prefix) with pytest.raises(StateTransactionError): - tx.get("key") + tx.get("key", prefix=prefix) with pytest.raises(StateTransactionError): - tx.delete("key") + tx.delete("key", prefix=prefix) with pytest.raises(StateTransactionError): - tx.exists("key") + tx.exists("key", prefix=prefix) with pytest.raises(StateTransactionError): tx.maybe_flush() @@ -360,11 +281,12 @@ def test_flush_failed_transaction_failed(self, rocksdb_partition): as failed and cannot be re-used anymore. """ + prefix = b"__key__" with patch.object( RocksDBStorePartition, "write", side_effect=ValueError("test") ): with rocksdb_partition.begin() as tx: - tx.set("key", "value") + tx.set("key", "value", prefix=prefix) with contextlib.suppress(ValueError): tx.maybe_flush() @@ -373,90 +295,227 @@ def test_flush_failed_transaction_failed(self, rocksdb_partition): # Ensure that Transaction cannot be used after it's failed with pytest.raises(StateTransactionError): - tx.set("key", "value") + tx.set("key", "value", prefix=prefix) with pytest.raises(StateTransactionError): - tx.get("key") + tx.get("key", prefix=prefix) with pytest.raises(StateTransactionError): - tx.delete("key") + tx.delete("key", prefix=prefix) with pytest.raises(StateTransactionError): - tx.exists("key") + tx.exists("key", prefix=prefix) assert tx.completed def test_transaction_not_flushed_on_error(self, rocksdb_partition): + prefix = b"__key__" with contextlib.suppress(ValueError): with rocksdb_partition.begin() as tx: - tx.set("key", "value") + tx.set("key", "value", prefix=prefix) raise ValueError("test") with rocksdb_partition.begin() as tx: - assert tx.get("key") is None + assert tx.get("key", prefix=prefix) is None def test_custom_dumps_loads(self, rocksdb_partition_factory): key = secrets.token_bytes(10) value = secrets.token_bytes(10) + prefix = b"__key__" with rocksdb_partition_factory( options=RocksDBOptions(loads=lambda v: v, dumps=lambda v: v) ) as db: with db.begin() as tx: - tx.set(key, value) + tx.set(key, value, prefix=prefix) with db.begin() as tx: - assert tx.get(key) == value + assert tx.get(key, prefix=prefix) == value def test_set_dict_nonstr_keys_fails(self, rocksdb_partition): key = "key" value = {0: 1} + prefix = b"__key__" with rocksdb_partition.begin() as tx: with 
pytest.raises(StateSerializationError): - tx.set(key, value) + tx.set(key, value, prefix=prefix) def test_set_datetime_fails(self, rocksdb_partition): key = "key" value = datetime.utcnow() + prefix = b"__key__" with rocksdb_partition.begin() as tx: with pytest.raises(StateSerializationError): - tx.set(key, value) + tx.set(key, value, prefix=prefix) def test_set_get_with_column_family(self, rocksdb_partition): key = "key" value = "value" + prefix = b"__key__" rocksdb_partition.create_column_family("cf") with rocksdb_partition.begin() as tx: - tx.set(key, value, cf_name="cf") - assert tx.get(key, cf_name="cf") == value + tx.set(key, value, cf_name="cf", prefix=prefix) + assert tx.get(key, cf_name="cf", prefix=prefix) == value with rocksdb_partition.begin() as tx: - assert tx.get(key, cf_name="cf") == value + assert tx.get(key, cf_name="cf", prefix=prefix) == value def test_set_delete_get_with_column_family(self, rocksdb_partition): key = "key" value = "value" + prefix = b"__key__" rocksdb_partition.create_column_family("cf") with rocksdb_partition.begin() as tx: - tx.set(key, value, cf_name="cf") - assert tx.get(key, cf_name="cf") == value - tx.delete(key, cf_name="cf") - assert tx.get(key, cf_name="cf") is None + tx.set(key, value, cf_name="cf", prefix=prefix) + assert tx.get(key, cf_name="cf", prefix=prefix) == value + tx.delete(key, cf_name="cf", prefix=prefix) + assert tx.get(key, cf_name="cf", prefix=prefix) is None with rocksdb_partition.begin() as tx: - assert tx.get(key, cf_name="cf") is None + assert tx.get(key, cf_name="cf", prefix=prefix) is None def test_set_exists_get_with_column_family(self, rocksdb_partition): key = "key" value = "value" rocksdb_partition.create_column_family("cf") + prefix = b"__key__" with rocksdb_partition.begin() as tx: - assert not tx.exists(key, cf_name="cf") - tx.set(key, value, cf_name="cf") - assert tx.exists(key, cf_name="cf") + assert not tx.exists(key, cf_name="cf", prefix=prefix) + tx.set(key, value, cf_name="cf", prefix=prefix) + assert tx.exists(key, cf_name="cf", prefix=prefix) with rocksdb_partition.begin() as tx: - assert tx.exists(key, cf_name="cf") + assert tx.exists(key, cf_name="cf", prefix=prefix) + + +class TestRocksDBPartitionTransactionChangelog: + def test_transaction_with_changelog_set( + self, rocksdb_partition_factory, changelog_producer_mock + ): + + data = [ + ("key1", "value1"), + ("key2", "value2"), + ("key3", "value3"), + ] + cf = "default" + prefix = b"__key__" + + with rocksdb_partition_factory( + changelog_producer=changelog_producer_mock + ) as partition: + assert partition.get_changelog_offset() is None + + with partition.begin() as tx: + for key, value in data: + tx.set( + key=key, + value=value, + cf_name=cf, + prefix=prefix, + ) + + assert changelog_producer_mock.produce.call_count == len(data) + for (key, value), call in zip( + data, changelog_producer_mock.produce.call_args_list + ): + assert call.kwargs["key"] == tx._serialize_key(key=key, prefix=prefix) + assert call.kwargs["value"] == tx._serialize_value(value=value) + assert call.kwargs["headers"] == {CHANGELOG_CF_MESSAGE_HEADER: cf} + + assert tx.completed + assert partition.get_changelog_offset() == len(data) + + def test_transaction_with_changelog_delete( + self, rocksdb_partition_factory, changelog_producer_mock + ): + key, value = "key", "value" + cf = "default" + prefix = b"__key__" + with rocksdb_partition_factory( + changelog_producer=changelog_producer_mock + ) as partition: + + assert partition.get_changelog_offset() is None + + with partition.begin() 
as tx: + tx.set(key=key, value=value, cf_name=cf, prefix=prefix) + + with partition.begin() as tx: + tx.delete(key=key, cf_name=cf, prefix=prefix) + + assert partition.get_changelog_offset() == 2 + assert changelog_producer_mock.produce.call_count == 2 + + set_changelog = changelog_producer_mock.produce.call_args_list[0] + assert set_changelog.kwargs["key"] == tx._serialize_key(key=key, prefix=prefix) + assert set_changelog.kwargs["value"] == tx._serialize_value(value=value) + assert set_changelog.kwargs["headers"] == {CHANGELOG_CF_MESSAGE_HEADER: cf} + + delete_changelog = changelog_producer_mock.produce.call_args_list[1] + assert delete_changelog.kwargs["key"] == tx._serialize_key( + key=key, prefix=prefix + ) + assert delete_changelog.kwargs["value"] is None + assert delete_changelog.kwargs["headers"] == {CHANGELOG_CF_MESSAGE_HEADER: cf} + + def test_transaction_with_changelog_delete_cached( + self, rocksdb_partition_factory, changelog_producer_mock + ): + """ + Test that only "delete" changelog message is emited if the key is set + and deleted in the same transaction. + """ + key, value = "key", "value" + cf = "default" + prefix = b"__key__" + + with rocksdb_partition_factory( + changelog_producer=changelog_producer_mock + ) as partition: + + assert partition.get_changelog_offset() is None + + with partition.begin() as tx: + tx.set(key=key, value=value, cf_name=cf, prefix=prefix) + tx.delete(key=key, cf_name=cf, prefix=prefix) + + assert changelog_producer_mock.produce.call_count == 1 + delete_changelog = changelog_producer_mock.produce.call_args_list[0] + assert delete_changelog.kwargs["key"] == tx._serialize_key( + key=key, prefix=prefix + ) + assert delete_changelog.kwargs["value"] is None + assert delete_changelog.kwargs["headers"] == { + CHANGELOG_CF_MESSAGE_HEADER: cf + } + + assert tx.completed + assert partition.get_changelog_offset() == 1 + + def test_transaction_with_changelog_delete_nonexisting_key( + self, rocksdb_partition_factory, changelog_producer_mock + ): + key = "key" + cf = "default" + prefix = b"__key__" + + with rocksdb_partition_factory( + changelog_producer=changelog_producer_mock + ) as partition: + + assert partition.get_changelog_offset() is None + + with partition.begin() as tx: + tx.delete(key=key, cf_name=cf, prefix=prefix) + assert tx.completed + assert partition.get_changelog_offset() == 1 + + changelog_producer_mock.produce.assert_called_with( + key=tx._serialize_key(key=key, prefix=prefix), + value=None, + headers={CHANGELOG_CF_MESSAGE_HEADER: cf}, + ) diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py index 2c7679e36..78ac3290e 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py @@ -38,12 +38,11 @@ def test_recover_window_from_changelog_message( ) store_partition.recover_from_changelog_message(changelog_msg) - with store_partition.begin() as tx: - with tx.with_prefix(kafka_key): - assert ( - tx.get_window(window["start_ms"], window["end_ms"]) == store_value - ) + assert ( + tx.get_window(window["start_ms"], window["end_ms"], prefix=kafka_key) + == store_value + ) assert store_partition.get_changelog_offset() == changelog_msg.offset() + 1 def test_recover_latest_expire_from_changelog_message( @@ -70,12 +69,12 @@ def test_recover_latest_expire_from_changelog_message( 
store_partition.recover_from_changelog_message(changelog_msg) with store_partition.begin() as tx: - with tx.with_prefix(kafka_key): - assert ( - tx.get( - LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, - cf_name=LATEST_EXPIRED_WINDOW_CF_NAME, - ) - == store_value + assert ( + tx.get( + LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, + cf_name=LATEST_EXPIRED_WINDOW_CF_NAME, + prefix=kafka_key, ) + == store_value + ) assert store_partition.get_changelog_offset() == changelog_msg.offset() + 1 diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_state.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_state.py new file mode 100644 index 000000000..b48740faf --- /dev/null +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_state.py @@ -0,0 +1,56 @@ +class TestWindowedRocksDBPartitionTransactionState: + def test_update_window(self, windowed_rocksdb_store_factory): + store = windowed_rocksdb_store_factory() + store.assign_partition(0) + prefix = b"__key__" + with store.start_partition_transaction(0) as tx: + state = tx.as_state(prefix=prefix) + state.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) + assert state.get_window(start_ms=0, end_ms=10) == 1 + + with store.start_partition_transaction(0) as tx: + state = tx.as_state(prefix=prefix) + assert state.get_window(start_ms=0, end_ms=10) == 1 + + def test_expire_windows(self, windowed_rocksdb_store_factory): + store = windowed_rocksdb_store_factory() + store.assign_partition(0) + prefix = b"__key__" + with store.start_partition_transaction(0) as tx: + state = tx.as_state(prefix=prefix) + state.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) + state.update_window(start_ms=10, end_ms=20, value=2, timestamp_ms=10) + + with store.start_partition_transaction(0) as tx: + state = tx.as_state(prefix=prefix) + state.update_window(start_ms=20, end_ms=30, value=3, timestamp_ms=20) + expired = state.expire_windows(duration_ms=10) + # "expire_windows" must update the expiration index so that the same + # windows are not expired twice + assert not state.expire_windows(duration_ms=10) + + assert len(expired) == 2 + assert expired == [ + ((0, 10), 1), + ((10, 20), 2), + ] + + with store.start_partition_transaction(0) as tx: + state = tx.as_state(prefix=prefix) + assert state.get_window(start_ms=0, end_ms=10) is None + assert state.get_window(start_ms=10, end_ms=20) is None + assert state.get_window(start_ms=20, end_ms=30) == 3 + + def test_get_latest_timestamp(self, windowed_rocksdb_store_factory): + store = windowed_rocksdb_store_factory() + partition = store.assign_partition(0) + timestamp = 123 + prefix = b"__key__" + with partition.begin() as tx: + state = tx.as_state(prefix) + state.update_window(0, 10, value=1, timestamp_ms=timestamp) + store.revoke_partition(0) + + partition = store.assign_partition(0) + with partition.begin() as tx: + assert tx.get_latest_timestamp() == timestamp diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py index 2cc87a62d..af3f0c507 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py @@ -17,50 +17,57 @@ class TestWindowedRocksDBPartitionTransaction: def test_update_window(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with 
store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) - assert tx.get_window(start_ms=0, end_ms=10) == 1 + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix + ) + assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) == 1 with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - assert tx.get_window(start_ms=0, end_ms=10) == 1 + assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) == 1 def test_get_window_doesnt_exist(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - assert tx.get_window(start_ms=0, end_ms=10) is None + assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None def test_delete_window(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=1) - assert tx.get_window(start_ms=0, end_ms=10) == 1 - tx.delete_window(start_ms=0, end_ms=10) + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=1, prefix=prefix + ) + assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) == 1 + tx.delete_window(start_ms=0, end_ms=10, prefix=prefix) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - assert tx.get_window(start_ms=0, end_ms=10) is None + assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None def test_expire_windows_expired(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) - tx.update_window(start_ms=10, end_ms=20, value=2, timestamp_ms=10) + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix + ) + tx.update_window( + start_ms=10, end_ms=20, value=2, timestamp_ms=10, prefix=prefix + ) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=20, end_ms=30, value=3, timestamp_ms=20) - expired = tx.expire_windows(duration_ms=10) - # "expire_windows" must update the expiration index so that the same - # windows are not expired twice - assert not tx.expire_windows(duration_ms=10) + tx.update_window( + start_ms=20, end_ms=30, value=3, timestamp_ms=20, prefix=prefix + ) + expired = tx.expire_windows(duration_ms=10, prefix=prefix) + # "expire_windows" must update the expiration index so that the same + # windows are not expired twice + assert not tx.expire_windows(duration_ms=10, prefix=prefix) assert len(expired) == 2 assert expired == [ @@ -69,10 +76,9 @@ def test_expire_windows_expired(self, windowed_rocksdb_store_factory): ] with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - assert tx.get_window(start_ms=0, end_ms=10) is None - assert tx.get_window(start_ms=10, end_ms=20) is None - assert tx.get_window(start_ms=20, end_ms=30) == 3 + assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None + assert tx.get_window(start_ms=10, end_ms=20, prefix=prefix) is None + assert tx.get_window(start_ms=20, end_ms=30, prefix=prefix) == 3 def 
test_expire_windows_cached(self, windowed_rocksdb_store_factory): """ @@ -81,48 +87,62 @@ def test_expire_windows_cached(self, windowed_rocksdb_store_factory): """ store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) - tx.update_window(start_ms=10, end_ms=20, value=2, timestamp_ms=10) - tx.update_window(start_ms=20, end_ms=30, value=3, timestamp_ms=20) - expired = tx.expire_windows(duration_ms=10) - # "expire_windows" must update the expiration index so that the same - # windows are not expired twice - assert not tx.expire_windows(duration_ms=10) - assert len(expired) == 2 - assert expired == [ - ((0, 10), 1), - ((10, 20), 2), - ] - assert tx.get_window(start_ms=0, end_ms=10) is None - assert tx.get_window(start_ms=10, end_ms=20) is None - assert tx.get_window(start_ms=20, end_ms=30) == 3 + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix + ) + tx.update_window( + start_ms=10, end_ms=20, value=2, timestamp_ms=10, prefix=prefix + ) + tx.update_window( + start_ms=20, end_ms=30, value=3, timestamp_ms=20, prefix=prefix + ) + expired = tx.expire_windows(duration_ms=10, prefix=prefix) + # "expire_windows" must update the expiration index so that the same + # windows are not expired twice + assert not tx.expire_windows(duration_ms=10, prefix=prefix) + assert len(expired) == 2 + assert expired == [ + ((0, 10), 1), + ((10, 20), 2), + ] + assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None + assert tx.get_window(start_ms=10, end_ms=20, prefix=prefix) is None + assert tx.get_window(start_ms=20, end_ms=30, prefix=prefix) == 3 def test_expire_windows_empty(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix + ) + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix + ) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=3, end_ms=13, value=1, timestamp_ms=3) - assert not tx.expire_windows(duration_ms=10) + tx.update_window( + start_ms=3, end_ms=13, value=1, timestamp_ms=3, prefix=prefix + ) + assert not tx.expire_windows(duration_ms=10, prefix=prefix) def test_expire_windows_with_grace_expired(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix + ) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=15, end_ms=25, value=1, timestamp_ms=15) - expired = tx.expire_windows(duration_ms=10, grace_ms=5) + tx.update_window( + start_ms=15, end_ms=25, value=1, timestamp_ms=15, prefix=prefix + ) + expired = tx.expire_windows(duration_ms=10, grace_ms=5, prefix=prefix) assert len(expired) == 1 assert expired == [((0, 10), 1)] @@ -130,14 +150,17 @@ def 
test_expire_windows_with_grace_expired(self, windowed_rocksdb_store_factory) def test_expire_windows_with_grace_empty(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix + ) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=13, end_ms=23, value=1, timestamp_ms=13) - expired = tx.expire_windows(duration_ms=10, grace_ms=5) + tx.update_window( + start_ms=13, end_ms=23, value=1, timestamp_ms=13, prefix=prefix + ) + expired = tx.expire_windows(duration_ms=10, grace_ms=5, prefix=prefix) assert not expired @@ -153,9 +176,10 @@ def test_get_window_invalid_duration( ): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: with pytest.raises(ValueError, match="Invalid window duration"): - tx.get_window(start_ms=start_ms, end_ms=end_ms) + tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) @pytest.mark.parametrize( "start_ms, end_ms", @@ -169,10 +193,15 @@ def test_update_window_invalid_duration( ): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: with pytest.raises(ValueError, match="Invalid window duration"): tx.update_window( - start_ms=start_ms, end_ms=end_ms, value=1, timestamp_ms=1 + start_ms=start_ms, + end_ms=end_ms, + value=1, + timestamp_ms=1, + prefix=prefix, ) @pytest.mark.parametrize( @@ -187,39 +216,50 @@ def test_delete_window_invalid_duration( ): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: with pytest.raises(ValueError, match="Invalid window duration"): - tx.delete_window(start_ms=start_ms, end_ms=end_ms) + tx.delete_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) def test_expire_windows_no_expired(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix + ) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=1, end_ms=11, value=1, timestamp_ms=9) - # "expire_windows" must update the expiration index so that the same - # windows are not expired twice - assert not tx.expire_windows(duration_ms=10) + tx.update_window( + start_ms=1, end_ms=11, value=1, timestamp_ms=9, prefix=prefix + ) + # "expire_windows" must update the expiration index so that the same + # windows are not expired twice + assert not tx.expire_windows(duration_ms=10, prefix=prefix) def test_expire_windows_multiple_windows(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() store.assign_partition(0) + prefix = b"__key__" with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=0, end_ms=10, value=1, timestamp_ms=2) - tx.update_window(start_ms=10, end_ms=20, value=1, timestamp_ms=11) - tx.update_window(start_ms=20, end_ms=30, value=1, 
timestamp_ms=21) + tx.update_window( + start_ms=0, end_ms=10, value=1, timestamp_ms=2, prefix=prefix + ) + tx.update_window( + start_ms=10, end_ms=20, value=1, timestamp_ms=11, prefix=prefix + ) + tx.update_window( + start_ms=20, end_ms=30, value=1, timestamp_ms=21, prefix=prefix + ) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - tx.update_window(start_ms=30, end_ms=40, value=1, timestamp_ms=31) - # "expire_windows" must update the expiration index so that the same - # windows are not expired twice - expired = tx.expire_windows(duration_ms=10) + tx.update_window( + start_ms=30, end_ms=40, value=1, timestamp_ms=31, prefix=prefix + ) + # "expire_windows" must update the expiration index so that the same + # windows are not expired twice + expired = tx.expire_windows(duration_ms=10, prefix=prefix) assert len(expired) == 3 assert expired[0] == ((0, 10), 1) @@ -244,8 +284,9 @@ def test_get_latest_timestamp_update(self, windowed_rocksdb_store_factory): store = windowed_rocksdb_store_factory() partition = store.assign_partition(0) timestamp = 123 + prefix = b"__key__" with partition.begin() as tx: - tx.update_window(0, 10, value=1, timestamp_ms=timestamp) + tx.update_window(0, 10, value=1, timestamp_ms=timestamp, prefix=prefix) with partition.begin() as tx: assert tx.get_latest_timestamp() == timestamp @@ -254,8 +295,9 @@ def test_get_latest_timestamp_loaded_from_db(self, windowed_rocksdb_store_factor store = windowed_rocksdb_store_factory() partition = store.assign_partition(0) timestamp = 123 + prefix = b"__key__" with partition.begin() as tx: - tx.update_window(0, 10, value=1, timestamp_ms=timestamp) + tx.update_window(0, 10, value=1, timestamp_ms=timestamp, prefix=prefix) store.revoke_partition(0) partition = store.assign_partition(0) @@ -268,9 +310,10 @@ def test_get_latest_timestamp_cannot_go_backwards( store = windowed_rocksdb_store_factory() partition = store.assign_partition(0) timestamp = 9 + prefix = b"__key__" with partition.begin() as tx: - tx.update_window(0, 10, value=1, timestamp_ms=timestamp) - tx.update_window(0, 10, value=1, timestamp_ms=timestamp - 1) + tx.update_window(0, 10, value=1, timestamp_ms=timestamp, prefix=prefix) + tx.update_window(0, 10, value=1, timestamp_ms=timestamp - 1, prefix=prefix) assert tx.get_latest_timestamp() == timestamp with partition.begin() as tx: @@ -283,25 +326,31 @@ def test_update_window(self, windowed_rocksdb_store_factory_changelog): partition_num = 0 store_partition = store.assign_partition(partition_num) producer = store_partition._changelog_producer._producer - key = b"__key__" + prefix = b"__key__" start_ms = 0 end_ms = 10 value = 1 with store.start_partition_transaction(partition_num) as tx: - with tx.with_prefix(key): - expected_produced_key = tx._serialize_key( - encode_window_key(start_ms, end_ms) - ) - expected_produced_value = tx._serialize_value(value) - tx.update_window( - start_ms=start_ms, end_ms=end_ms, value=value, timestamp_ms=2 - ) - assert tx.get_window(start_ms=start_ms, end_ms=end_ms) == value + expected_produced_key = tx._serialize_key( + encode_window_key(start_ms, end_ms), prefix=prefix + ) + expected_produced_value = tx._serialize_value(value) + tx.update_window( + start_ms=start_ms, + end_ms=end_ms, + value=value, + timestamp_ms=2, + prefix=prefix, + ) + assert ( + tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) == value + ) with store.start_partition_transaction(partition_num) as tx: - with tx.with_prefix(key): - assert tx.get_window(start_ms=start_ms, 
end_ms=end_ms) == value + assert ( + tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) == value + ) assert ( store_partition.get_changelog_offset() == producer.produce.call_count == 1 @@ -319,28 +368,26 @@ def test_delete_window(self, windowed_rocksdb_store_factory_changelog): partition_num = 0 store_partition = store.assign_partition(partition_num) producer = store_partition._changelog_producer._producer - key = b"__key__" + prefix = b"__key__" expected_produced_value = None start_ms = 0 end_ms = 10 with store.start_partition_transaction(partition_num) as tx: - with tx.with_prefix(key): - expected_produced_key = tx._serialize_key( - encode_window_key(start_ms, end_ms) - ) - tx.update_window( - start_ms=start_ms, end_ms=end_ms, value=1, timestamp_ms=1 - ) - assert tx.get_window(start_ms=start_ms, end_ms=end_ms) == 1 - tx.delete_window(start_ms=start_ms, end_ms=end_ms) + expected_produced_key = tx._serialize_key( + encode_window_key(start_ms, end_ms), prefix=prefix + ) + tx.update_window( + start_ms=start_ms, end_ms=end_ms, value=1, timestamp_ms=1, prefix=prefix + ) + assert tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) == 1 + tx.delete_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) with store.start_partition_transaction(partition_num) as tx: - with tx.with_prefix(key): - assert ( - tx.get_window(start_ms=start_ms, end_ms=end_ms) - is expected_produced_value - ) + assert ( + tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) + is expected_produced_value + ) assert ( store_partition.get_changelog_offset() == producer.produce.call_count == 1 @@ -358,7 +405,7 @@ def test_expire_windows_expired(self, windowed_rocksdb_store_factory_changelog): partition_num = 0 store_partition = store.assign_partition(partition_num) producer = store_partition._changelog_producer._producer - key = b"__key__" + prefix = b"__key__" expected_update_produce_keys = [] expected_update_produce_values = [] expected_expired_window_keys = [] @@ -369,40 +416,33 @@ def test_expire_windows_expired(self, windowed_rocksdb_store_factory_changelog): # update windows, which will become expired later with store.start_partition_transaction(partition_num) as tx: - with tx.with_prefix(key): - for kwargs in expected_expired_windows: - serialized_key = tx._serialize_key( - encode_window_key( - start_ms=kwargs["start_ms"], end_ms=kwargs["end_ms"] - ) - ) - expected_update_produce_keys.append(serialized_key) - expected_expired_window_keys.append(serialized_key) - expected_update_produce_values.append( - tx._serialize_value(kwargs["value"]) - ) - tx.update_window(**kwargs) - - # add new window update, which expires previous windows - with store.start_partition_transaction(partition_num) as tx: - with tx.with_prefix(key): - kwargs = dict(start_ms=20, end_ms=30, value=3, timestamp_ms=20) - expected_update_produce_keys.append( - tx._serialize_key( - encode_window_key( - start_ms=kwargs["start_ms"], end_ms=kwargs["end_ms"] - ) - ) + for kwargs in expected_expired_windows: + serialized_key = tx._serialize_key( + encode_window_key(kwargs["start_ms"], kwargs["end_ms"]), + prefix=prefix, ) + expected_update_produce_keys.append(serialized_key) + expected_expired_window_keys.append(serialized_key) expected_update_produce_values.append( tx._serialize_value(kwargs["value"]) ) - tx.update_window(**kwargs) - expired = tx.expire_windows(duration_ms=10) - print(expired) - # "expire_windows" must update the expiration index so that the same - # windows are not expired twice - assert not 
tx.expire_windows(duration_ms=10) + tx.update_window(**kwargs, prefix=prefix) + + # add new window update, which expires previous windows + with store.start_partition_transaction(partition_num) as tx: + kwargs = dict(start_ms=20, end_ms=30, value=3, timestamp_ms=20) + expected_update_produce_keys.append( + tx._serialize_key( + encode_window_key(kwargs["start_ms"], kwargs["end_ms"]), + prefix=prefix, + ) + ) + expected_update_produce_values.append(tx._serialize_value(kwargs["value"])) + tx.update_window(**kwargs, prefix=prefix) + expired = tx.expire_windows(duration_ms=10, prefix=prefix) + # "expire_windows" must update the expiration index so that the same + # windows are not expired twice + assert not tx.expire_windows(duration_ms=10, prefix=prefix) assert expired == [ ((w["start_ms"], w["end_ms"]), w["value"]) for w in expected_expired_windows @@ -436,7 +476,7 @@ def test_expire_windows_expired(self, windowed_rocksdb_store_factory_changelog): produce_calls.append( call( - key=key + PREFIX_SEPARATOR + LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, + key=prefix + PREFIX_SEPARATOR + LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, value=str(expected_expired_windows[-1]["start_ms"]).encode(), headers={CHANGELOG_CF_MESSAGE_HEADER: LATEST_EXPIRED_WINDOW_CF_NAME}, topic=store_partition._changelog_producer._changelog_name, @@ -448,10 +488,10 @@ def test_expire_windows_expired(self, windowed_rocksdb_store_factory_changelog): assert producer.produce.call_count == len(produce_calls) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - assert tx.get_window(start_ms=0, end_ms=10) is None - assert tx.get_window(start_ms=10, end_ms=20) is None - assert tx.get_window(start_ms=20, end_ms=30) == 3 + prefix = b"__key__" + assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None + assert tx.get_window(start_ms=10, end_ms=20, prefix=prefix) is None + assert tx.get_window(start_ms=20, end_ms=30, prefix=prefix) == 3 def test_expire_windows_cached(self, windowed_rocksdb_store_factory_changelog): """ @@ -466,6 +506,7 @@ def test_expire_windows_cached(self, windowed_rocksdb_store_factory_changelog): store_partition = store.assign_partition(partition_num) producer = store_partition._changelog_producer._producer key = b"__key__" + expected_update_produce_keys = [] expected_update_produce_values = [] update_windows = [ @@ -476,26 +517,25 @@ def test_expire_windows_cached(self, windowed_rocksdb_store_factory_changelog): expected_expired_windows = update_windows[:2] with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - for kwargs in update_windows: - serialized_key = tx._serialize_key( - encode_window_key( - start_ms=kwargs["start_ms"], end_ms=kwargs["end_ms"] - ) + + for kwargs in update_windows: + serialized_key = tx._serialize_key( + encode_window_key(kwargs["start_ms"], kwargs["end_ms"]), + prefix=key, + ) + tx.update_window(**kwargs, prefix=key) + expected_update_produce_keys.append(serialized_key) + if kwargs in expected_expired_windows: + expected_update_produce_values.append(None) + else: + expected_update_produce_values.append( + tx._serialize_value(kwargs["value"]) ) - tx.update_window(**kwargs) - expected_update_produce_keys.append(serialized_key) - if kwargs in expected_expired_windows: - expected_update_produce_values.append(None) - else: - expected_update_produce_values.append( - tx._serialize_value(kwargs["value"]) - ) - - expired = tx.expire_windows(duration_ms=10) - # "expire_windows" must update the expiration index so that the same - # windows are 
not expired twice - assert not tx.expire_windows(duration_ms=10) + + expired = tx.expire_windows(duration_ms=10, prefix=key) + # "expire_windows" must update the expiration index so that the same + # windows are not expired twice + assert not tx.expire_windows(duration_ms=10, prefix=key) assert expired == [ ((w["start_ms"], w["end_ms"]), w["value"]) for w in expected_expired_windows @@ -528,7 +568,6 @@ def test_expire_windows_cached(self, windowed_rocksdb_store_factory_changelog): assert producer.produce.call_count == len(produce_calls) with store.start_partition_transaction(0) as tx: - with tx.with_prefix(b"__key__"): - assert tx.get_window(start_ms=0, end_ms=10) is None - assert tx.get_window(start_ms=10, end_ms=20) is None - assert tx.get_window(start_ms=20, end_ms=30) == 3 + assert tx.get_window(start_ms=0, end_ms=10, prefix=key) is None + assert tx.get_window(start_ms=10, end_ms=20, prefix=key) is None + assert tx.get_window(start_ms=20, end_ms=30, prefix=key) == 3 From 877ead967c4840d89c9324f2c8581a4ff6aa6c1c Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Fri, 12 Apr 2024 22:35:49 +0200 Subject: [PATCH 03/28] Accept delivery callbacks in Producer.produce() --- quixstreams/kafka/producer.py | 43 ++++++------------- .../test_kafka/test_producer.py | 13 ++++++ 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/quixstreams/kafka/producer.py b/quixstreams/kafka/producer.py index 841065b1f..63be26aeb 100644 --- a/quixstreams/kafka/producer.py +++ b/quixstreams/kafka/producer.py @@ -1,13 +1,14 @@ import logging -from typing import Union, Optional -from typing_extensions import Literal -from quixstreams.models.types import Headers +from typing import Union, Optional, Callable from confluent_kafka import ( Producer as ConfluentProducer, KafkaError, Message, ) +from typing_extensions import Literal + +from quixstreams.models.types import Headers __all__ = ( "Producer", @@ -18,6 +19,8 @@ "random", "consistent_random", "murmur2", "murmur2_random", "fnv1a", "fnv1a_random" ] +DeliveryCallback = Callable[[Optional[KafkaError], Message], None] + logger = logging.getLogger(__name__) @@ -31,26 +34,6 @@ def _default_error_cb(error: KafkaError): ) -def _on_delivery_cb(err: Optional[KafkaError], msg: Message): - if err is not None: - logger.debug( - 'Delivery failed: topic="%s" partition="%s" key="%s" error=%s ' "code=%s", - msg.topic(), - msg.partition(), - msg.key(), - err.str(), - err.code(), - ) - else: - logger.debug( - 'Delivery succeeded: topic="%s" partition="%s" key="%s" value="%s"', - msg.topic(), - msg.partition(), - msg.key(), - msg.value(), - ) - - class Producer: def __init__( self, @@ -87,9 +70,6 @@ def __init__( ) self._producer_config = config self._inner_producer: Optional[ConfluentProducer] = None - # Optimization: pass `on_delivery` callbacks only in "debug" mode, otherwise - # it significantly reduces a throughput because of additional function calls - self._enable_delivery_callbacks = logger.isEnabledFor(logging.DEBUG) def produce( self, @@ -101,10 +81,12 @@ def produce( timestamp: Optional[int] = None, poll_timeout: float = 5.0, buffer_error_max_tries: int = 3, + on_delivery: Optional[DeliveryCallback] = None, ): """ - Produce message to topic. - It also polls Kafka for callbacks before producing in order to minimize + Produce a message to a topic. + + It also polls Kafka for callbacks before producing to minimize the probability of `BufferError`. If `BufferError` still happens, the method will poll Kafka with timeout to free up the buffer and try again. 
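
Note: with the module-level debug-only delivery callback removed above, delivery reporting becomes opt-in per message: callers pass their own `on_delivery` callable, and confluent-kafka invokes it with `(error, message)` on a subsequent `poll()` or `flush()`. A minimal usage sketch; the broker address and topic name below are placeholders, not values from this patch:

from quixstreams.kafka.producer import Producer

delivered = []

def on_delivery(error, msg):
    # Called from poll()/flush(); "error" is None when the broker acknowledged the message
    if error is None:
        delivered.append((msg.topic(), msg.partition(), msg.offset()))

producer = Producer(broker_address="localhost:9092")  # placeholder broker address
producer.produce(
    topic="example-topic",  # placeholder topic
    key="key",
    value=b"value",
    on_delivery=on_delivery,
)
producer.flush()  # serves pending callbacks, so "delivered" is populated here
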
@@ -118,6 +100,8 @@ def produce( :param poll_timeout: timeout for `poll()` call in case of `BufferError` :param buffer_error_max_tries: max retries for `BufferError`. Pass `0` to not retry after `BufferError`. + :param on_delivery: the delivery callback to be triggered on `poll()` + for the produced message. """ @@ -125,9 +109,8 @@ def produce( "partition": partition, "timestamp": timestamp, "headers": headers, + "on_delivery": on_delivery, } - if self._enable_delivery_callbacks: - kwargs["on_delivery"] = _on_delivery_cb # confluent_kafka doesn't like None for optional parameters kwargs = {k: v for k, v in kwargs.items() if v is not None} diff --git a/tests/test_quixstreams/test_kafka/test_producer.py b/tests/test_quixstreams/test_kafka/test_producer.py index 0d6b1f403..470495f12 100644 --- a/tests/test_quixstreams/test_kafka/test_producer.py +++ b/tests/test_quixstreams/test_kafka/test_producer.py @@ -16,6 +16,19 @@ def test_produce(self, producer, topic_factory): ) producer.poll(1.0) + def test_produce_on_delivery_callback(self, producer, topic_factory): + topic_name, _ = topic_factory() + + offsets = [] + with producer: + producer.produce( + topic=topic_name, + key="test", + value=b"test", + on_delivery=lambda error, msg: offsets.append(msg.offset()), + ) + assert len(offsets) == 1 + def test_produce_failure_no_error(self, producer_factory, topic_factory): topic_name, _ = topic_factory() extra_config = { From d4b5f3797d31ead13cfe4d7f4a079d2fbf672ecf Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Fri, 19 Apr 2024 12:06:45 +0200 Subject: [PATCH 04/28] Update RowProducer to track message delivery - Save the latest produced TPs and offsets using delivery callbacks - If delivery callback returns an error, raise it on next produce() or flush() - Move KafkaMessageError outside `rowconsumer.py` and rename it to KafkaException --- quixstreams/kafka/exceptions.py | 26 +++++ quixstreams/rowconsumer.py | 48 +-------- quixstreams/rowproducer.py | 119 ++++++++++++++------- tests/test_quixstreams/test_app.py | 8 +- tests/test_quixstreams/test_rowconsumer.py | 8 +- tests/test_quixstreams/test_rowproducer.py | 52 +++++++-- 6 files changed, 166 insertions(+), 95 deletions(-) create mode 100644 quixstreams/kafka/exceptions.py diff --git a/quixstreams/kafka/exceptions.py b/quixstreams/kafka/exceptions.py new file mode 100644 index 000000000..70c3feab4 --- /dev/null +++ b/quixstreams/kafka/exceptions.py @@ -0,0 +1,26 @@ +from confluent_kafka import KafkaError + +from quixstreams.exceptions import QuixException + + +class KafkaException(QuixException): + def __init__(self, error: KafkaError): + self.error = error + + @property + def code(self) -> int: + return self.error.code() + + @property + def description(self): + return self.error.str() + + def __str__(self): + return ( + f"<{self.__class__.__name__} " + f'code="{self.code}" ' + f'description="{self.description}">' + ) + + def __repr__(self): + return str(self) diff --git a/quixstreams/rowconsumer.py b/quixstreams/rowconsumer.py index e56a2013f..977e4e3e4 100644 --- a/quixstreams/rowconsumer.py +++ b/quixstreams/rowconsumer.py @@ -2,59 +2,21 @@ from typing import Optional, Callable, List, Union, Mapping from confluent_kafka import KafkaError, TopicPartition -from typing_extensions import Protocol from .error_callbacks import ConsumerErrorCallback, default_on_consumer_error -from .exceptions import QuixException, PartitionAssignmentError +from .exceptions import PartitionAssignmentError from .kafka import Consumer, AssignmentStrategy, AutoOffsetReset 
from .kafka.consumer import RebalancingCallback +from .kafka.exceptions import KafkaException from .models import Topic, Row from .models.serializers.exceptions import IgnoreMessage logger = logging.getLogger(__name__) +__all__ = ("RowConsumer",) -class KafkaMessageError(QuixException): - def __init__(self, error: KafkaError): - self.error = error - @property - def code(self) -> int: - return self.error.code() - - @property - def description(self): - return self.error.str() - - def __str__(self): - return ( - f"<{self.__class__.__name__} " - f'code="{self.code}" ' - f'description="{self.description}">' - ) - - def __repr__(self): - return str(self) - - -class RowConsumerProto(Protocol): - def commit( - self, - message=None, - offsets: List[TopicPartition] = None, - asynchronous: bool = True, - ) -> Optional[List[TopicPartition]]: ... - - def subscribe( - self, - topics: List[Topic], - on_assign: Optional[RebalancingCallback] = None, - on_revoke: Optional[RebalancingCallback] = None, - on_lost: Optional[RebalancingCallback] = None, - ): ... - - -class RowConsumer(Consumer, RowConsumerProto): +class RowConsumer(Consumer): def __init__( self, broker_address: str, @@ -172,7 +134,7 @@ def poll_row(self, timeout: float = None) -> Union[Row, List[Row], None]: topic_name, partition, offset = msg.topic(), msg.partition(), msg.offset() try: if msg.error(): - raise KafkaMessageError(error=msg.error()) + raise KafkaException(error=msg.error()) topic = self._topics[topic_name] diff --git a/quixstreams/rowproducer.py b/quixstreams/rowproducer.py index ab6845ed6..903bbc261 100644 --- a/quixstreams/rowproducer.py +++ b/quixstreams/rowproducer.py @@ -1,49 +1,36 @@ import logging -from typing import Optional, Any +from typing import Optional, Any, Union, Dict, Tuple -from typing_extensions import Protocol +from confluent_kafka import KafkaError, Message from .error_callbacks import ProducerErrorCallback, default_on_producer_error +from .kafka.exceptions import KafkaException from .kafka.producer import Producer, Partitioner -from .models import Topic, Row +from .models import Topic, Row, Headers logger = logging.getLogger(__name__) -class RowProducerProto(Protocol): - def produce_row( - self, - row: Row, - topic: Topic, - key: Optional[Any] = None, - partition: Optional[int] = None, - timestamp: Optional[int] = None, - ): ... - - -class RowProducer(Producer, RowProducerProto): +class RowProducer: """ A producer class that is capable of serializing Rows to bytes and send them to Kafka. The serialization is performed according to the Topic serialization settings. - It overrides `.subscribe()` method of Consumer class to accept `Topic` - objects instead of strings. - - :param broker_address: Kafka broker host and port in format `:`. - Passed as `bootstrap.servers` to `confluent_kafka.Producer`. - :param partitioner: A function to be used to determine the outgoing message - partition. - Available values: "random", "consistent_random", "murmur2", "murmur2_random", - "fnv1a", "fnv1a_random" - Default - "murmur2". - :param extra_config: A dictionary with additional options that - will be passed to `confluent_kafka.Producer` as is. - Note: values passed as arguments override values in `extra_config`. - :param on_error: a callback triggered when `RowProducer.produce_row()` - or `RowProducer.poll()` fail`. - If producer fails and the callback returns `True`, the exception - will be logged but not propagated. - The default callback logs an exception and returns `False`. 
+ :param broker_address: Kafka broker host and port in format `:`. + Passed as `bootstrap.servers` to `confluent_kafka.Producer`. + :param partitioner: A function to be used to determine the outgoing message + partition. + Available values: "random", "consistent_random", "murmur2", "murmur2_random", + "fnv1a", "fnv1a_random" + Default - "murmur2". + :param extra_config: A dictionary with additional options that + will be passed to `confluent_kafka.Producer` as is. + Note: values passed as arguments override values in `extra_config`. + :param on_error: a callback triggered when `RowProducer.produce_row()` + or `RowProducer.poll()` fail`. + If producer fails and the callback returns `True`, the exception + will be logged but not propagated. + The default callback logs an exception and returns `False`. """ def __init__( @@ -53,14 +40,17 @@ def __init__( extra_config: dict = None, on_error: Optional[ProducerErrorCallback] = None, ): - super().__init__( + self._producer = Producer( broker_address=broker_address, partitioner=partitioner, extra_config=extra_config, ) + self._on_error: Optional[ProducerErrorCallback] = ( on_error or default_on_producer_error ) + self._tp_offsets: Dict[Tuple[str, int], int] = {} + self._error: Optional[KafkaError] = None def produce_row( self, @@ -103,14 +93,71 @@ def poll(self, timeout: float = None): """ Polls the producer for events and calls `on_delivery` callbacks. - If poll fails, it will trigger the provided "on_error" callback + If `poll()` fails, it will trigger the provided "on_error" callback :param timeout: timeout in seconds """ try: - super().poll(timeout=timeout) + self._producer.poll(timeout=timeout) except Exception as exc: to_suppress = self._on_error(exc, None, logger) if to_suppress: return raise + + def produce( + self, + topic: str, + value: Optional[Union[str, bytes]] = None, + key: Optional[Union[str, bytes]] = None, + headers: Optional[Headers] = None, + partition: Optional[int] = None, + timestamp: Optional[int] = None, + poll_timeout: float = 5.0, + buffer_error_max_tries: int = 3, + ): + self._raise_for_error() + + return self._producer.produce( + topic=topic, + value=value, + key=key, + headers=headers, + partition=partition, + timestamp=timestamp, + poll_timeout=poll_timeout, + buffer_error_max_tries=buffer_error_max_tries, + on_delivery=self._on_delivery, + ) + + def _on_delivery(self, err: Optional[KafkaError], msg: Message): + if self._error is not None: + # There's an error already set + return + + topic, partition, offset = msg.topic(), msg.partition(), msg.offset() + if err is None: + self._tp_offsets[(topic, partition)] = offset + else: + self._error = err + + def _raise_for_error(self): + if self._error is not None: + exc = KafkaException(self._error) + self._error = None + raise exc + + def flush(self, timeout: Optional[float] = None) -> int: + result = self._producer.flush(timeout=timeout) + self._raise_for_error() + return result + + @property + def offsets(self) -> Dict[Tuple[str, int], int]: + return self._tp_offsets + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.flush() diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py index 89f107dcc..9c1edcf3f 100644 --- a/tests/test_quixstreams/test_app.py +++ b/tests/test_quixstreams/test_app.py @@ -12,6 +12,7 @@ from quixstreams.app import Application from quixstreams.dataframe import StreamingDataFrame from quixstreams.dataframe.windows.base import get_window_ranges +from 
quixstreams.kafka.exceptions import KafkaException from quixstreams.models import ( DoubleDeserializer, DoubleSerializer, @@ -24,10 +25,7 @@ QuixKafkaConfigsBuilder, ) from quixstreams.platforms.quix.env import QuixEnvironment -from quixstreams.rowconsumer import ( - KafkaMessageError, - RowConsumer, -) +from quixstreams.rowconsumer import RowConsumer from quixstreams.state import State from tests.utils import TopicPartitionStub @@ -184,7 +182,7 @@ def test_run_consumer_error_raised(self, app_factory, executor): # Stop app after 10s if nothing failed executor.submit(_stop_app_on_timeout, app, 10.0) - with pytest.raises(KafkaMessageError): + with pytest.raises(KafkaException): app.run(sdf) def test_run_deserialization_error_raised(self, app_factory, executor): diff --git a/tests/test_quixstreams/test_rowconsumer.py b/tests/test_quixstreams/test_rowconsumer.py index a61fe4045..f4dcfec44 100644 --- a/tests/test_quixstreams/test_rowconsumer.py +++ b/tests/test_quixstreams/test_rowconsumer.py @@ -7,7 +7,7 @@ IgnoreMessage, SerializationError, ) -from quixstreams.rowconsumer import KafkaMessageError +from quixstreams.kafka.exceptions import KafkaException from tests.utils import Timeout @@ -61,7 +61,7 @@ def test_poll_row_kafka_error( auto_offset_reset="earliest", ) as consumer: consumer.subscribe([topic]) - with pytest.raises(KafkaMessageError) as raised: + with pytest.raises(KafkaException) as raised: consumer.poll_row(10.0) exc = raised.value assert exc.code == KafkaError.UNKNOWN_TOPIC_OR_PART @@ -112,7 +112,7 @@ def test_poll_row_kafka_error_raise( producer.produce(topic.name, key=b"key", value=b"value") producer.flush() consumer.subscribe([topic]) - with pytest.raises(KafkaMessageError): + with pytest.raises(KafkaException): consumer.poll_row(10.0) def test_poll_row_deserialization_error_suppress( @@ -147,7 +147,7 @@ def test_poll_row_kafka_error_suppress( suppressed = False def on_error(exc, *args): - assert isinstance(exc, KafkaMessageError) + assert isinstance(exc, KafkaException) nonlocal suppressed suppressed = True return True diff --git a/tests/test_quixstreams/test_rowproducer.py b/tests/test_quixstreams/test_rowproducer.py index 1a06cdfb6..59f15b09d 100644 --- a/tests/test_quixstreams/test_rowproducer.py +++ b/tests/test_quixstreams/test_rowproducer.py @@ -1,7 +1,8 @@ from concurrent.futures import Future import pytest -from confluent_kafka import KafkaException +from confluent_kafka import KafkaException as ConfluentKafkaException +from quixstreams.kafka.exceptions import KafkaException from quixstreams.models import ( JSONSerializer, @@ -17,15 +18,12 @@ def test_produce_row_success( topic_json_serdes_factory, row_factory, ): - topic = topic_json_serdes_factory() - + topic = topic_json_serdes_factory(num_partitions=1) key = b"key" value = {"field": "value"} headers = [("header1", b"1")] - with row_consumer_factory( - auto_offset_reset="earliest" - ) as consumer, row_producer_factory() as producer: + with row_producer_factory() as producer: row = row_factory( topic=topic.name, value=value, @@ -33,8 +31,14 @@ def test_produce_row_success( headers=headers, ) producer.produce_row(topic=topic, row=row) + + with row_consumer_factory(auto_offset_reset="earliest") as consumer: consumer.subscribe([topic]) row = consumer.poll_row(timeout=5.0) + + assert producer.offsets + assert producer.offsets.get((topic.name, 0)) is not None + assert row assert row.key == key assert row.value == value @@ -97,7 +101,7 @@ def test_produce_row_produce_error_raise( topic=topic.name, value={"field": 1001 * 
"a"}, ) - with pytest.raises(KafkaException): + with pytest.raises(ConfluentKafkaException): producer.produce_row(topic=topic, row=row) def test_produce_row_serialization_error_suppress( @@ -122,3 +126,37 @@ def on_error(exc, *args): value=object(), ) producer.produce_row(topic=topic, row=row) + + def test_produce_delivery_error_raised_on_produce( + self, row_producer_factory, topic_json_serdes_factory + ): + topic = topic_json_serdes_factory(num_partitions=1) + key = b"key" + value = b"value" + + producer = row_producer_factory() + + # Send message to a non-existing partition to simulate error + # in the delivery callback + producer.produce(topic=topic.name, key=key, value=value, partition=3) + # Poll for delivery callbacks + producer.poll(5) + # The next produce should fail after + with pytest.raises(KafkaException): + producer.produce(topic=topic.name, key=key, value=value) + + def test_produce_delivery_error_raised_on_flush( + self, row_producer_factory, topic_json_serdes_factory + ): + topic = topic_json_serdes_factory(num_partitions=1) + key = b"key" + value = b"value" + + producer = row_producer_factory() + + # Send message to a non-existing partition to simulate error + # in the delivery callback + producer.produce(topic=topic.name, key=key, value=value, partition=3) + # The flush should fail after that + with pytest.raises(KafkaException): + producer.flush() From 0267e00231b7d5f0ada97f7a021ca775e5404a57 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Fri, 19 Apr 2024 22:28:15 +0200 Subject: [PATCH 05/28] Separate changelog producing and flush in State - Make flushing state to the disk and producing changelogs separate operations - Rename "maybe_flush" -> "flush" --- quixstreams/state/rocksdb/exceptions.py | 4 + quixstreams/state/rocksdb/partition.py | 17 +- quixstreams/state/rocksdb/transaction.py | 249 ++++++++++------ .../state/rocksdb/windowed/transaction.py | 10 +- quixstreams/state/types.py | 106 ++++++- .../test_state/test_recovery.py | 2 +- .../test_rocksdb/test_transaction.py | 191 +++++++------ .../test_rocksdb/test_windowed/fixtures.py | 26 +- .../test_windowed/test_transaction.py | 269 +++--------------- 9 files changed, 446 insertions(+), 428 deletions(-) diff --git a/quixstreams/state/rocksdb/exceptions.py b/quixstreams/state/rocksdb/exceptions.py index 063fa49fc..008f44b6c 100644 --- a/quixstreams/state/rocksdb/exceptions.py +++ b/quixstreams/state/rocksdb/exceptions.py @@ -6,6 +6,7 @@ "ColumnFamilyDoesNotExist", "ColumnFamilyAlreadyExists", "ColumnFamilyHeaderMissing", + "InvalidChangelogOffset", ) @@ -25,3 +26,6 @@ class ColumnFamilyAlreadyExists(StateError): ... class ColumnFamilyHeaderMissing(StateError): ... + + +class InvalidChangelogOffset(StateError): ... diff --git a/quixstreams/state/rocksdb/partition.py b/quixstreams/state/rocksdb/partition.py index cf78cb45b..8aab50316 100644 --- a/quixstreams/state/rocksdb/partition.py +++ b/quixstreams/state/rocksdb/partition.py @@ -1,6 +1,6 @@ import logging import time -from typing import Any, Union, Optional, List, Dict +from typing import Any, Union, Optional, List, Dict, Tuple from rocksdict import WriteBatch, Rdict, ColumnFamily, AccessType, WriteOptions @@ -77,6 +77,21 @@ def __init__( def using_changelogs(self) -> bool: return bool(self._changelog_producer) + @property + def changelog_topic_partition(self) -> Optional[Tuple[str, int]]: + """ + Return the changelog topic-partition for the given StorePartition. + + Returns `None` if changelog_producer is not provided. 
+ + :return: (topic, partition) or None + """ + if self._changelog_producer is not None: + return ( + self._changelog_producer.changelog_name, + self._changelog_producer.partition, + ) + def begin( self, ) -> RocksDBPartitionTransaction: diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py index e1ee5db20..5ac6c2195 100644 --- a/quixstreams/state/rocksdb/transaction.py +++ b/quixstreams/state/rocksdb/transaction.py @@ -1,17 +1,16 @@ import functools import logging -from typing import Any, Union, Optional, Dict, NewType, TYPE_CHECKING +from typing import Any, Union, Optional, Dict, NewType, TYPE_CHECKING, Tuple -from rocksdict import WriteBatch, ColumnFamily +from rocksdict import WriteBatch from quixstreams.state.types import ( DumpsFunc, LoadsFunc, PartitionTransaction, + PartitionTransactionStatus, ) -from .exceptions import ( - StateTransactionError, -) +from .exceptions import StateTransactionError, InvalidChangelogOffset from .metadata import ( METADATA_CF_NAME, PROCESSED_OFFSET_KEY, @@ -19,16 +18,14 @@ PREFIX_SEPARATOR, CHANGELOG_CF_MESSAGE_HEADER, ) -from .serialization import ( - serialize, - deserialize, - int_to_int64_bytes, -) +from .serialization import serialize, deserialize, int_to_int64_bytes from ..state import TransactionState if TYPE_CHECKING: from .partition import RocksDBStorePartition +__all__ = ("RocksDBPartitionTransaction", "DEFAULT_PREFIX", "DELETED") + logger = logging.getLogger(__name__) Undefined = NewType("Undefined", object) @@ -38,27 +35,23 @@ DEFAULT_PREFIX = b"" -__all__ = ("RocksDBPartitionTransaction", "DEFAULT_PREFIX", "DELETED") - -def _validate_transaction_state(func): +def _validate_transaction_status(*allowed: PartitionTransactionStatus): """ - Check that the state of `RocksDBTransaction` is valid before calling a method + Check that the status of `RocksDBTransaction` is valid before calling a method """ - @functools.wraps(func) - def wrapper(*args, **kwargs): - self: RocksDBPartitionTransaction = args[0] - if self.failed: - raise StateTransactionError( - "Transaction is failed, create a new one to proceed" - ) - if self.completed: - raise StateTransactionError( - "Transaction is already finished, create a new one to proceed" - ) + def wrapper(func): + @functools.wraps(func) + def _wrapper(tx: "RocksDBPartitionTransaction", *args, **kwargs): + if tx.status not in allowed: + raise StateTransactionError( + f"Invalid transaction status {tx.status}, " f"allowed: {allowed}" + ) + + return func(tx, *args, **kwargs) - return func(*args, **kwargs) + return _wrapper return wrapper @@ -86,7 +79,7 @@ class RocksDBPartitionTransaction(PartitionTransaction): within the transaction before it's flushed (aka "read-your-own-writes" problem). If any mutation fails during the transaction - (e.g. we failed to write the updates to the RocksDB), the whole transaction + (e.g., failed to write the updates to the RocksDB), the whole transaction will be marked as failed and cannot be used anymore. In this case, a new `RocksDBTransaction` should be created. 
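As a point of reference (not an excerpt from the diff), a minimal sketch of the transaction lifecycle this patch introduces below — assuming an existing `RocksDBStorePartition` instance named `partition`, as built by the test fixtures, and the default serializers:

    prefix = b"__key__"

    tx = partition.begin()                              # status: STARTED, accepts updates
    tx.set(key="key", value="value", prefix=prefix)
    # Reads hit the update cache first ("read-your-own-writes")
    assert tx.get("key", prefix=prefix) == "value"

    tx.prepare(processed_offset=10)                     # produce changelog messages
    assert tx.prepared                                  # further set()/delete() calls now raise StateTransactionError

    tx.flush(processed_offset=10, changelog_offset=10)  # write the batch to RocksDB
    assert tx.completed

Note that `flush()` refuses a `changelog_offset` lower than the one already stored by raising `InvalidChangelogOffset`.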
@@ -97,10 +90,9 @@ class RocksDBPartitionTransaction(PartitionTransaction): "_partition", "_update_cache", "_batch", - "_failed", - "_completed", "_dumps", "_loads", + "_status", ) def __init__( @@ -120,20 +112,20 @@ def __init__( str, Dict[bytes, Dict[bytes, Union[bytes, Undefined]]] ] = {"default": {}} self._batch = WriteBatch(raw_mode=True) - self._failed = False - self._completed = False self._dumps = dumps self._loads = loads + self._status = PartitionTransactionStatus.STARTED def as_state(self, prefix: Any = DEFAULT_PREFIX) -> TransactionState: """ - Create a one-time `TransactionState` object with a limited CRUD interface. + Create a one-time use `TransactionState` object with a limited CRUD interface + to be provided to `StreamingDataFrame` operations. The `TransactionState` will prefix all the keys with the supplied `prefix` for all underlying operations. :param prefix: a prefix to be used for all keys - :return: + :return: an instance of `TransactionState` """ return TransactionState( transaction=self, @@ -144,7 +136,7 @@ def as_state(self, prefix: Any = DEFAULT_PREFIX) -> TransactionState: ), ) - @_validate_transaction_state + @_validate_transaction_status(PartitionTransactionStatus.STARTED) def get( self, key: Any, prefix: bytes, default: Any = None, cf_name: str = "default" ) -> Optional[Any]: @@ -184,7 +176,7 @@ def get( return self._deserialize_value(stored) return default - @_validate_transaction_state + @_validate_transaction_status(PartitionTransactionStatus.STARTED) def set(self, key: Any, value: Any, prefix: bytes, cf_name: str = "default"): """ Set a key to the store. @@ -204,10 +196,10 @@ def set(self, key: Any, value: Any, prefix: bytes, cf_name: str = "default"): key_serialized ] = value_serialized except Exception: - self._failed = True + self._status = PartitionTransactionStatus.FAILED raise - @_validate_transaction_state + @_validate_transaction_status(PartitionTransactionStatus.STARTED) def delete(self, key: Any, prefix: bytes, cf_name: str = "default"): """ Delete a key from the store. @@ -225,10 +217,10 @@ def delete(self, key: Any, prefix: bytes, cf_name: str = "default"): ] = DELETED except Exception: - self._failed = True + self._status = PartitionTransactionStatus.FAILED raise - @_validate_transaction_state + @_validate_transaction_status(PartitionTransactionStatus.STARTED) def exists(self, key: Any, prefix: bytes, cf_name: str = "default") -> bool: """ Check if a key exists in the store. @@ -255,6 +247,10 @@ def exists(self, key: Any, prefix: bytes, cf_name: str = "default") -> bool: return self._partition.exists(key_serialized, cf_name=cf_name) + @property + def status(self) -> PartitionTransactionStatus: + return self._status + @property def completed(self) -> bool: """ @@ -267,7 +263,19 @@ def completed(self) -> bool: :return: `True` if transaction is completed, `False` otherwise. """ - return self._completed + return self._status == PartitionTransactionStatus.COMPLETE + + @property + def prepared(self) -> bool: + """ + Check if the transaction is in PREPARED status. + + Prepared transaction successefully flushed its changelog and cannot receive + updates anymore, but its state is not yet flushed to the disk + + :return: `True` if transaction is prepared, `False` otherwise. + """ + return self._status == PartitionTransactionStatus.PREPARED @property def failed(self) -> bool: @@ -279,72 +287,143 @@ def failed(self) -> bool: :return: `True` if transaction is failed, `False` otherwise. 
""" - return self._failed - - def _update_changelog(self, meta_cf_handle: ColumnFamily): - logger.debug("Flushing state changes to the changelog topic...") - offset = self._partition.get_changelog_offset() or 0 - + return self._status == PartitionTransactionStatus.FAILED + + def _produce_changelog(self, processed_offset: Optional[int] = None): + if not self._partition.using_changelogs: + return + + # TODO: Add topic offset to the changelog headers + changelog_topic, changelog_partition = self._partition.changelog_topic_partition + logger.debug( + f"Flushing state changes to the changelog topic " + f'topic_name="{changelog_topic}" ' + f"partition={changelog_partition} " + f"processed_offset={processed_offset}" + ) + # Iterate over the transaction update cache for cf_name, cf_update_cache in self._update_cache.items(): headers = {CHANGELOG_CF_MESSAGE_HEADER: cf_name} for _prefix, prefix_update_cache in cf_update_cache.items(): for key, value in prefix_update_cache.items(): + # Produce changes to the changelog topic self._partition.produce_to_changelog( key=key, value=value if value is not DELETED else None, headers=headers, ) - offset += 1 - self._batch.put( - CHANGELOG_OFFSET_KEY, int_to_int64_bytes(offset), meta_cf_handle - ) - logger.debug(f"Changelog offset set to {offset}") + def _flush_state( + self, + processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None, + ): + meta_cf_handle = self._partition.get_column_family_handle(METADATA_CF_NAME) + # Iterate over the transaction update cache + for cf_name, cf_update_cache in self._update_cache.items(): + cf_handle = self._partition.get_column_family_handle(cf_name) + for _prefix, prefix_update_cache in cf_update_cache.items(): + for key, value in prefix_update_cache.items(): + # Apply changes to the Writebatch + if value is DELETED: + self._batch.delete(key, cf_handle) + else: + self._batch.put(key, value, cf_handle) + + if not len(self._batch): + # Exit early if transaction doesn't update anything + return + + # Save the latest processed input topic offset + if processed_offset is not None: + self._batch.put( + PROCESSED_OFFSET_KEY, + int_to_int64_bytes(processed_offset), + meta_cf_handle, + ) + # Save the latest changelog topic offset to know where to recover from + # It may be None if changelog topics are disabled + if changelog_offset is not None: + current_changelog_offset = self._partition.get_changelog_offset() + if ( + current_changelog_offset is not None + and changelog_offset < current_changelog_offset + ): + raise InvalidChangelogOffset( + f"Cannot set changelog offset lower than already saved one" + ) + self._batch.put( + CHANGELOG_OFFSET_KEY, + int_to_int64_bytes(changelog_offset), + meta_cf_handle, + ) + self._partition.write(self._batch) + + @_validate_transaction_status(PartitionTransactionStatus.STARTED) + def prepare(self, processed_offset: Optional[int] = None): + """ + Produce changelog messages to the changelog topic for all changes accumulated + in this transaction and prepare transcation to flush its state to the state + store. + + After successful `prepare()`, the transaction status is changed to PREPARED, + and it cannot receive updates anymore. + + If changelog is disabled for this application, no updates will be produced + to the changelog topic. 
+ + :param processed_offset: the offset of the latest processed message + """ + try: + self._produce_changelog(processed_offset=processed_offset) + self._status = PartitionTransactionStatus.PREPARED + except Exception: + self._status = PartitionTransactionStatus.FAILED + raise - @_validate_transaction_state - def maybe_flush(self, offset: Optional[int] = None): + @_validate_transaction_status( + PartitionTransactionStatus.STARTED, PartitionTransactionStatus.PREPARED + ) + def flush( + self, + processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None, + ): """ - Flush the recent updates to the database and empty the update cache. + Flush the recent updates to the database. It writes the WriteBatch to RocksDB and marks itself as finished. - If writing fails, the transaction will be also marked as "failed" and + If writing fails, the transaction is marked as failed and cannot be used anymore. >***NOTE:*** If no keys have been modified during the transaction (i.e. no "set" or "delete" have been called at least once), it will - not flush ANY data to the database including the offset in order to optimize + not flush ANY data to the database including the offset to optimize I/O. - :param offset: offset of the last processed message, optional. + :param processed_offset: offset of the last processed message, optional. + :param changelog_offset: offset of the last produced changelog message, + optional. """ try: - meta_cf_handle = self._partition.get_column_family_handle(METADATA_CF_NAME) - for cf_name, cf_update_cache in self._update_cache.items(): - cf_handle = self._partition.get_column_family_handle(cf_name) - for _prefix, prefix_update_cache in cf_update_cache.items(): - for key, value in prefix_update_cache.items(): - if value is DELETED: - self._batch.delete(key, cf_handle) - else: - self._batch.put(key, value, cf_handle) - - # TODO: Maybe unify writebatch and changelog work here so we do only one pass - # through the update cache - - # Don't write batches if this transaction doesn't change any keys - if len(self._batch): - if offset is not None: - self._batch.put( - PROCESSED_OFFSET_KEY, int_to_int64_bytes(offset), meta_cf_handle - ) - if self._partition.using_changelogs: - self._update_changelog(meta_cf_handle) - self._partition.write(self._batch) + self._flush_state( + processed_offset=processed_offset, changelog_offset=changelog_offset + ) + self._status = PartitionTransactionStatus.COMPLETE except Exception: - self._failed = True + self._status = PartitionTransactionStatus.FAILED raise - finally: - self._completed = True + + @property + def changelog_topic_partition(self) -> Optional[Tuple[str, int]]: + """ + Return the changelog topic-partition for the StorePartition of this transaction. + + Returns `None` if changelog_producer is not provided. 
+ + :return: (topic, partition) or None + """ + return self._partition.changelog_topic_partition def _serialize_value(self, value: Any) -> bytes: return serialize(value, dumps=self._dumps) @@ -361,5 +440,7 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): - if exc_val is None and not self._failed: - self.maybe_flush() + # Note: with state transactions, context managers are meant to be used mostly + # in tests + if exc_val is None and not self.failed: + self.flush() diff --git a/quixstreams/state/rocksdb/windowed/transaction.py b/quixstreams/state/rocksdb/windowed/transaction.py index 754bcd96a..d392cf71e 100644 --- a/quixstreams/state/rocksdb/windowed/transaction.py +++ b/quixstreams/state/rocksdb/windowed/transaction.py @@ -79,14 +79,20 @@ def delete_window(self, start_ms: int, end_ms: int, prefix: bytes): key = encode_window_key(start_ms, end_ms) self.delete(key=key, prefix=prefix) - def maybe_flush(self, offset: Optional[int] = None): + def flush( + self, + processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None, + ): cf_handle = self._partition.get_column_family_handle(METADATA_CF_NAME) self._batch.put( LATEST_TIMESTAMP_KEY, int_to_int64_bytes(self._latest_timestamp_ms), cf_handle, ) - super().maybe_flush(offset=offset) + super().flush( + processed_offset=processed_offset, changelog_offset=changelog_offset + ) self._partition.set_latest_timestamp(self._latest_timestamp_ms) def expire_windows( diff --git a/quixstreams/state/types.py b/quixstreams/state/types.py index 56cb77113..5c63ad3d5 100644 --- a/quixstreams/state/types.py +++ b/quixstreams/state/types.py @@ -1,4 +1,5 @@ -from typing import Protocol, Any, Optional, Callable, Dict, ClassVar +import enum +from typing import Protocol, Any, Optional, Callable, Dict, ClassVar, Tuple from quixstreams.models import ConfluentKafkaMessageProto from quixstreams.models.types import MessageHeadersMapping @@ -252,17 +253,59 @@ def failed(self) -> bool: @property def completed(self) -> bool: """ - Return `True` if transaction is completed. + Return `True` if transaction is successfully completed. Completed transactions cannot be re-used. :return: bool """ ... - def maybe_flush(self, offset: Optional[int] = None): + @property + def prepared(self) -> bool: + """ + Return `True` if transaction is prepared completed. + + Prepared transactions cannot receive new updates, but can be flushed. + :return: bool + """ + ... + + def prepare(self, processed_offset: Optional[int] = None): + """ + Produce changelog messages to the changelog topic for all changes accumulated + in this transaction and prepare transcation to flush its state to the state + store. + + After successful `prepare()`, the transaction status is changed to PREPARED, + and it cannot receive updates anymore. + + If changelog is disabled for this application, no updates will be produced + to the changelog topic. + + :param processed_offset: the offset of the latest processed message """ - Flush the recent updates and last processed offset to the storage. - :param offset: offset of the last processed message, optional. + + @property + def changelog_topic_partition(self) -> Optional[Tuple[str, int]]: + """ + Return the changelog topic-partition for the StorePartition of this transaction. + + Returns `None` if changelog_producer is not provided. 
+ + :return: (topic, partition) or None + """ + + def flush( + self, + processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None, + ): + """ + Flush the recent updates to the storage. + + :param processed_offset: offset of the last processed message, optional. + :param changelog_offset: offset of the last produced changelog message, + optional. """ def __enter__(self): ... @@ -344,13 +387,38 @@ def failed(self) -> bool: @property def completed(self) -> bool: """ - Return `True` if transaction is completed. + Return `True` if transaction is successfully completed. Completed transactions cannot be re-used. :return: bool """ ... + @property + def prepared(self) -> bool: + """ + Return `True` if transaction is prepared completed. + + Prepared transactions cannot receive new updates, but can be flushed. + :return: bool + """ + ... + + def prepare(self, processed_offset: Optional[int] = None): + """ + Produce changelog messages to the changelog topic for all changes accumulated + in this transaction and prepare transcation to flush its state to the state + store. + + After successful `prepare()`, the transaction status is changed to PREPARED, + and it cannot receive updates anymore. + + If changelog is disabled for this application, no updates will be produced + to the changelog topic. + + :param processed_offset: the offset of the latest processed message + """ + def as_state(self, prefix: Any) -> WindowedState: ... def get_window( @@ -415,10 +483,17 @@ def expire_windows(self, duration_ms: int, prefix: bytes, grace_ms: int = 0): """ ... - def maybe_flush(self, offset: Optional[int] = None): + def flush( + self, + processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None, + ): """ - Flush the recent updates and last processed offset to the storage. - :param offset: offset of the last processed message, optional. + Flush the recent updates to the storage. + + :param processed_offset: offset of the last processed message, optional. + :param changelog_offset: offset of the last produced changelog message, + optional. """ def __enter__(self): ... @@ -435,6 +510,17 @@ def write_from_changelog_message(self): ... def flush(self): """ - Flush the recovery update and last processed offset to the storage. + Flush the recovery update to the storage. """ ... 
+ + +class PartitionTransactionStatus(enum.Enum): + STARTED = 1 # Transaction is started and accepts updates + + PREPARED = 2 # Transaction is prepared, it can no longer receive updates + # and can only be flushed + + COMPLETE = 3 # Transaction is fully completed, it cannot be used anymore + + FAILED = 4 # Transaction is failed, it cannot be used anymore diff --git a/tests/test_quixstreams/test_state/test_recovery.py b/tests/test_quixstreams/test_state/test_recovery.py index b066ad1d6..f701b1f5e 100644 --- a/tests/test_quixstreams/test_state/test_recovery.py +++ b/tests/test_quixstreams/test_state/test_recovery.py @@ -84,7 +84,7 @@ def test_produce( writer = ChangelogProducer( changelog_name=changelog.name, - partition_num=p_num, + partition=p_num, producer=row_producer_factory(), ) writer.produce( diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py index b41d3f6af..01aad679b 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py @@ -12,10 +12,9 @@ RocksDBStorePartition, RocksDBOptions, RocksDBPartitionTransaction, + InvalidChangelogOffset, ) -from quixstreams.state.rocksdb.metadata import ( - CHANGELOG_CF_MESSAGE_HEADER, -) +from quixstreams.state.rocksdb.metadata import CHANGELOG_CF_MESSAGE_HEADER from quixstreams.state.rocksdb.serialization import serialize from quixstreams.utils.json import dumps @@ -271,42 +270,28 @@ def test_update_key_failed_transaction_failed(self, operation, rocksdb_partition tx.exists("key", prefix=prefix) with pytest.raises(StateTransactionError): - tx.maybe_flush() + tx.flush() assert not tx.completed - def test_flush_failed_transaction_failed(self, rocksdb_partition): + def test_update_key_prepared_transaction_fails(self, rocksdb_partition): """ - Test that if the "maybe_flush()" fails the transaction is also marked - as failed and cannot be re-used anymore. + Test that any update operation (set or delete) fails if the transaction is + marked as prepared. """ prefix = b"__key__" - with patch.object( - RocksDBStorePartition, "write", side_effect=ValueError("test") - ): - with rocksdb_partition.begin() as tx: - tx.set("key", "value", prefix=prefix) + tx = rocksdb_partition.begin() - with contextlib.suppress(ValueError): - tx.maybe_flush() + tx.set(key="key", value="value", prefix=prefix) + tx.prepare() + assert tx.prepared - assert tx.failed + with pytest.raises(StateTransactionError): + tx.set("key", value="value", prefix=prefix) - # Ensure that Transaction cannot be used after it's failed - with pytest.raises(StateTransactionError): - tx.set("key", "value", prefix=prefix) - - with pytest.raises(StateTransactionError): - tx.get("key", prefix=prefix) - - with pytest.raises(StateTransactionError): - tx.delete("key", prefix=prefix) - - with pytest.raises(StateTransactionError): - tx.exists("key", prefix=prefix) - - assert tx.completed + with pytest.raises(StateTransactionError): + tx.delete("key", prefix=prefix) def test_transaction_not_flushed_on_error(self, rocksdb_partition): prefix = b"__key__" @@ -390,12 +375,69 @@ def test_set_exists_get_with_column_family(self, rocksdb_partition): with rocksdb_partition.begin() as tx: assert tx.exists(key, cf_name="cf", prefix=prefix) + def test_flush_failed_transaction_failed(self, rocksdb_partition): + """ + Test that if the "flush()" fails the transaction is also marked + as failed and cannot be re-used. 
+ """ -class TestRocksDBPartitionTransactionChangelog: - def test_transaction_with_changelog_set( - self, rocksdb_partition_factory, changelog_producer_mock - ): + prefix = b"__key__" + with patch.object( + RocksDBStorePartition, "write", side_effect=ValueError("test") + ): + with rocksdb_partition.begin() as tx: + tx.set("key", "value", prefix=prefix) + + with contextlib.suppress(ValueError): + tx.flush() + assert tx.failed + + # Ensure that Transaction cannot be used after it's failed + with pytest.raises(StateTransactionError): + tx.set("key", "value", prefix=prefix) + + with pytest.raises(StateTransactionError): + tx.get("key", prefix=prefix) + + with pytest.raises(StateTransactionError): + tx.delete("key", prefix=prefix) + + with pytest.raises(StateTransactionError): + tx.exists("key", prefix=prefix) + + @pytest.mark.parametrize( + "processed_offset, changelog_offset", [(None, None), (1, 1)] + ) + def test_flush_success(self, processed_offset, changelog_offset, rocksdb_partition): + tx = rocksdb_partition.begin() + + # Set some key to probe the transaction + tx.set(key="key", value="value", prefix=b"__key__") + + tx.flush(processed_offset=processed_offset, changelog_offset=changelog_offset) + assert tx.completed + + assert rocksdb_partition.get_changelog_offset() == changelog_offset + assert rocksdb_partition.get_processed_offset() == processed_offset + + def test_flush_invalid_changelog_offset(self, rocksdb_partition): + tx1 = rocksdb_partition.begin() + # Set some key to probe the transaction + tx1.set(key="key", value="value", prefix=b"__key__") + + # Flush first transaction to update the changelog offset + tx1.flush(changelog_offset=9999) + assert tx1.completed + + tx2 = rocksdb_partition.begin() + tx2.set(key="key", value="value", prefix=b"__key__") + # Flush second transaction with a smaller changelog offset + with pytest.raises(InvalidChangelogOffset): + tx2.flush(changelog_offset=1) + assert tx2.failed + + def test_set_and_prepare(self, rocksdb_partition_factory, changelog_producer_mock): data = [ ("key1", "value1"), ("key2", "value2"), @@ -407,16 +449,15 @@ def test_transaction_with_changelog_set( with rocksdb_partition_factory( changelog_producer=changelog_producer_mock ) as partition: - assert partition.get_changelog_offset() is None - - with partition.begin() as tx: - for key, value in data: - tx.set( - key=key, - value=value, - cf_name=cf, - prefix=prefix, - ) + tx = partition.begin() + for key, value in data: + tx.set( + key=key, + value=value, + cf_name=cf, + prefix=prefix, + ) + tx.prepare() assert changelog_producer_mock.produce.call_count == len(data) for (key, value), call in zip( @@ -426,10 +467,9 @@ def test_transaction_with_changelog_set( assert call.kwargs["value"] == tx._serialize_value(value=value) assert call.kwargs["headers"] == {CHANGELOG_CF_MESSAGE_HEADER: cf} - assert tx.completed - assert partition.get_changelog_offset() == len(data) + assert tx.prepared - def test_transaction_with_changelog_delete( + def test_delete_and_prepare( self, rocksdb_partition_factory, changelog_producer_mock ): key, value = "key", "value" @@ -439,30 +479,22 @@ def test_transaction_with_changelog_delete( changelog_producer=changelog_producer_mock ) as partition: - assert partition.get_changelog_offset() is None - - with partition.begin() as tx: - tx.set(key=key, value=value, cf_name=cf, prefix=prefix) - - with partition.begin() as tx: - tx.delete(key=key, cf_name=cf, prefix=prefix) + tx = partition.begin() + tx.delete(key=key, cf_name=cf, prefix=prefix) - assert 
partition.get_changelog_offset() == 2 - assert changelog_producer_mock.produce.call_count == 2 + tx.prepare() - set_changelog = changelog_producer_mock.produce.call_args_list[0] - assert set_changelog.kwargs["key"] == tx._serialize_key(key=key, prefix=prefix) - assert set_changelog.kwargs["value"] == tx._serialize_value(value=value) - assert set_changelog.kwargs["headers"] == {CHANGELOG_CF_MESSAGE_HEADER: cf} + assert tx.prepared + assert changelog_producer_mock.produce.call_count == 1 - delete_changelog = changelog_producer_mock.produce.call_args_list[1] + delete_changelog = changelog_producer_mock.produce.call_args_list[0] assert delete_changelog.kwargs["key"] == tx._serialize_key( key=key, prefix=prefix ) assert delete_changelog.kwargs["value"] is None assert delete_changelog.kwargs["headers"] == {CHANGELOG_CF_MESSAGE_HEADER: cf} - def test_transaction_with_changelog_delete_cached( + def test_set_delete_and_prepare( self, rocksdb_partition_factory, changelog_producer_mock ): """ @@ -476,13 +508,13 @@ def test_transaction_with_changelog_delete_cached( with rocksdb_partition_factory( changelog_producer=changelog_producer_mock ) as partition: + tx = partition.begin() + tx.set(key=key, value=value, cf_name=cf, prefix=prefix) + tx.delete(key=key, cf_name=cf, prefix=prefix) - assert partition.get_changelog_offset() is None - - with partition.begin() as tx: - tx.set(key=key, value=value, cf_name=cf, prefix=prefix) - tx.delete(key=key, cf_name=cf, prefix=prefix) + tx.prepare() + assert tx.prepared assert changelog_producer_mock.produce.call_count == 1 delete_changelog = changelog_producer_mock.produce.call_args_list[0] assert delete_changelog.kwargs["key"] == tx._serialize_key( @@ -492,30 +524,3 @@ def test_transaction_with_changelog_delete_cached( assert delete_changelog.kwargs["headers"] == { CHANGELOG_CF_MESSAGE_HEADER: cf } - - assert tx.completed - assert partition.get_changelog_offset() == 1 - - def test_transaction_with_changelog_delete_nonexisting_key( - self, rocksdb_partition_factory, changelog_producer_mock - ): - key = "key" - cf = "default" - prefix = b"__key__" - - with rocksdb_partition_factory( - changelog_producer=changelog_producer_mock - ) as partition: - - assert partition.get_changelog_offset() is None - - with partition.begin() as tx: - tx.delete(key=key, cf_name=cf, prefix=prefix) - assert tx.completed - assert partition.get_changelog_offset() == 1 - - changelog_producer_mock.produce.assert_called_with( - key=tx._serialize_key(key=key, prefix=prefix), - value=None, - headers={CHANGELOG_CF_MESSAGE_HEADER: cf}, - ) diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py index 727d8b31a..0fc5618dd 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py @@ -5,7 +5,9 @@ import pytest from quixstreams.rowproducer import RowProducer -from quixstreams.state.recovery import ChangelogProducerFactory +from quixstreams.state.recovery import ChangelogProducerFactory, ChangelogProducer +from quixstreams.state.rocksdb import RocksDBOptions +from quixstreams.state.rocksdb.windowed.partition import WindowedRocksDBStorePartition from quixstreams.state.rocksdb.windowed.store import WindowedRocksDBStore @@ -25,6 +27,28 @@ def factory( return factory +@pytest.fixture() +def windowed_rocksdb_partition_factory(tmp_path): + def factory( + name: str = "db", + options: 
Optional[RocksDBOptions] = None, + changelog_producer: Optional[ChangelogProducer] = None, + ) -> WindowedRocksDBStorePartition: + path = (tmp_path / name).as_posix() + _options = options or RocksDBOptions(open_max_retries=0, open_retry_backoff=3.0) + if not changelog_producer: + changelog_producer = create_autospec(ChangelogProducer)( + "topic", "partition", "producer" + ) + return WindowedRocksDBStorePartition( + path, + changelog_producer=changelog_producer, + options=_options, + ) + + return factory + + @pytest.fixture() def windowed_rocksdb_store_factory_changelog(tmp_path): def factory( diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py index af3f0c507..2b07a0539 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py @@ -1,15 +1,6 @@ -from unittest.mock import call - import pytest -from quixstreams.state.rocksdb.metadata import ( - CHANGELOG_CF_MESSAGE_HEADER, - PREFIX_SEPARATOR, -) -from quixstreams.state.rocksdb.windowed.metadata import ( - LATEST_EXPIRED_WINDOW_CF_NAME, - LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, -) +from quixstreams.state.rocksdb.metadata import CHANGELOG_CF_MESSAGE_HEADER from quixstreams.state.rocksdb.windowed.serialization import encode_window_key @@ -319,23 +310,18 @@ def test_get_latest_timestamp_cannot_go_backwards( with partition.begin() as tx: assert tx.get_latest_timestamp() == timestamp - -class TestWindowedRocksDBPartitionTransactionChangelog: - def test_update_window(self, windowed_rocksdb_store_factory_changelog): - store = windowed_rocksdb_store_factory_changelog() - partition_num = 0 - store_partition = store.assign_partition(partition_num) - producer = store_partition._changelog_producer._producer + def test_update_window_and_prepare( + self, windowed_rocksdb_partition_factory, changelog_producer_mock + ): prefix = b"__key__" start_ms = 0 end_ms = 10 value = 1 - with store.start_partition_transaction(partition_num) as tx: - expected_produced_key = tx._serialize_key( - encode_window_key(start_ms, end_ms), prefix=prefix - ) - expected_produced_value = tx._serialize_value(value) + with windowed_rocksdb_partition_factory( + changelog_producer=changelog_producer_mock + ) as store_partition: + tx = store_partition.begin() tx.update_window( start_ms=start_ms, end_ms=end_ms, @@ -343,231 +329,42 @@ def test_update_window(self, windowed_rocksdb_store_factory_changelog): timestamp_ms=2, prefix=prefix, ) - assert ( - tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) == value - ) + tx.prepare() + assert tx.prepared - with store.start_partition_transaction(partition_num) as tx: - assert ( - tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) == value - ) - - assert ( - store_partition.get_changelog_offset() == producer.produce.call_count == 1 + assert changelog_producer_mock.produce.call_count == 1 + expected_produced_key = tx._serialize_key( + encode_window_key(start_ms, end_ms), prefix=prefix ) - producer.produce.assert_called_with( + expected_produced_value = tx._serialize_value(value) + changelog_producer_mock.produce.assert_called_with( key=expected_produced_key, value=expected_produced_value, headers={CHANGELOG_CF_MESSAGE_HEADER: "default"}, - topic=store_partition._changelog_producer._changelog_name, - partition=store_partition._changelog_producer._partition_num, ) - def 
test_delete_window(self, windowed_rocksdb_store_factory_changelog): - store = windowed_rocksdb_store_factory_changelog() - partition_num = 0 - store_partition = store.assign_partition(partition_num) - producer = store_partition._changelog_producer._producer - prefix = b"__key__" - expected_produced_value = None - start_ms = 0 - end_ms = 10 + def test_delete_window_and_prepare( + self, windowed_rocksdb_partition_factory, changelog_producer_mock + ): + with windowed_rocksdb_partition_factory( + changelog_producer=changelog_producer_mock + ) as store_partition: - with store.start_partition_transaction(partition_num) as tx: - expected_produced_key = tx._serialize_key( - encode_window_key(start_ms, end_ms), prefix=prefix - ) - tx.update_window( - start_ms=start_ms, end_ms=end_ms, value=1, timestamp_ms=1, prefix=prefix - ) - assert tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) == 1 - tx.delete_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) + prefix = b"__key__" + start_ms = 0 + end_ms = 10 - with store.start_partition_transaction(partition_num) as tx: - assert ( - tx.get_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) - is expected_produced_value - ) + tx = store_partition.begin() + tx.delete_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) + tx.prepare() + assert tx.prepared - assert ( - store_partition.get_changelog_offset() == producer.produce.call_count == 1 + assert changelog_producer_mock.produce.call_count == 1 + expected_produced_key = tx._serialize_key( + encode_window_key(start_ms, end_ms), prefix=prefix ) - producer.produce.assert_called_with( + changelog_producer_mock.produce.assert_called_with( key=expected_produced_key, - value=expected_produced_value, + value=None, headers={CHANGELOG_CF_MESSAGE_HEADER: "default"}, - topic=store_partition._changelog_producer._changelog_name, - partition=store_partition._changelog_producer._partition_num, ) - - def test_expire_windows_expired(self, windowed_rocksdb_store_factory_changelog): - store = windowed_rocksdb_store_factory_changelog() - partition_num = 0 - store_partition = store.assign_partition(partition_num) - producer = store_partition._changelog_producer._producer - prefix = b"__key__" - expected_update_produce_keys = [] - expected_update_produce_values = [] - expected_expired_window_keys = [] - expected_expired_windows = [ - dict(start_ms=0, end_ms=10, value=1, timestamp_ms=2), - dict(start_ms=10, end_ms=20, value=2, timestamp_ms=10), - ] - - # update windows, which will become expired later - with store.start_partition_transaction(partition_num) as tx: - for kwargs in expected_expired_windows: - serialized_key = tx._serialize_key( - encode_window_key(kwargs["start_ms"], kwargs["end_ms"]), - prefix=prefix, - ) - expected_update_produce_keys.append(serialized_key) - expected_expired_window_keys.append(serialized_key) - expected_update_produce_values.append( - tx._serialize_value(kwargs["value"]) - ) - tx.update_window(**kwargs, prefix=prefix) - - # add new window update, which expires previous windows - with store.start_partition_transaction(partition_num) as tx: - kwargs = dict(start_ms=20, end_ms=30, value=3, timestamp_ms=20) - expected_update_produce_keys.append( - tx._serialize_key( - encode_window_key(kwargs["start_ms"], kwargs["end_ms"]), - prefix=prefix, - ) - ) - expected_update_produce_values.append(tx._serialize_value(kwargs["value"])) - tx.update_window(**kwargs, prefix=prefix) - expired = tx.expire_windows(duration_ms=10, prefix=prefix) - # "expire_windows" must update the expiration 
index so that the same - # windows are not expired twice - assert not tx.expire_windows(duration_ms=10, prefix=prefix) - - assert expired == [ - ((w["start_ms"], w["end_ms"]), w["value"]) for w in expected_expired_windows - ] - - produce_calls = [ - call( - key=k, - value=v, - headers={CHANGELOG_CF_MESSAGE_HEADER: "default"}, - topic=store_partition._changelog_producer._changelog_name, - partition=store_partition._changelog_producer._partition_num, - ) - for k, v in zip( - expected_update_produce_keys, expected_update_produce_values - ) - ] - - produce_calls.extend( - [ - call( - key=k, - value=None, - headers={CHANGELOG_CF_MESSAGE_HEADER: "default"}, - topic=store_partition._changelog_producer._changelog_name, - partition=store_partition._changelog_producer._partition_num, - ) - for k in expected_expired_window_keys - ] - ) - - produce_calls.append( - call( - key=prefix + PREFIX_SEPARATOR + LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, - value=str(expected_expired_windows[-1]["start_ms"]).encode(), - headers={CHANGELOG_CF_MESSAGE_HEADER: LATEST_EXPIRED_WINDOW_CF_NAME}, - topic=store_partition._changelog_producer._changelog_name, - partition=store_partition._changelog_producer._partition_num, - ) - ) - - producer.produce.assert_has_calls(produce_calls) - assert producer.produce.call_count == len(produce_calls) - - with store.start_partition_transaction(0) as tx: - prefix = b"__key__" - assert tx.get_window(start_ms=0, end_ms=10, prefix=prefix) is None - assert tx.get_window(start_ms=10, end_ms=20, prefix=prefix) is None - assert tx.get_window(start_ms=20, end_ms=30, prefix=prefix) == 3 - - def test_expire_windows_cached(self, windowed_rocksdb_store_factory_changelog): - """ - Check that windows expire correctly even if they're not committed to the DB - yet. - - Consequently, only the end result of a window should be produced to the - changelog topic, not every update. 
- """ - store = windowed_rocksdb_store_factory_changelog() - partition_num = 0 - store_partition = store.assign_partition(partition_num) - producer = store_partition._changelog_producer._producer - key = b"__key__" - - expected_update_produce_keys = [] - expected_update_produce_values = [] - update_windows = [ - dict(start_ms=0, end_ms=10, value=1, timestamp_ms=2), - dict(start_ms=10, end_ms=20, value=2, timestamp_ms=10), - dict(start_ms=20, end_ms=30, value=3, timestamp_ms=20), - ] - expected_expired_windows = update_windows[:2] - - with store.start_partition_transaction(0) as tx: - - for kwargs in update_windows: - serialized_key = tx._serialize_key( - encode_window_key(kwargs["start_ms"], kwargs["end_ms"]), - prefix=key, - ) - tx.update_window(**kwargs, prefix=key) - expected_update_produce_keys.append(serialized_key) - if kwargs in expected_expired_windows: - expected_update_produce_values.append(None) - else: - expected_update_produce_values.append( - tx._serialize_value(kwargs["value"]) - ) - - expired = tx.expire_windows(duration_ms=10, prefix=key) - # "expire_windows" must update the expiration index so that the same - # windows are not expired twice - assert not tx.expire_windows(duration_ms=10, prefix=key) - - assert expired == [ - ((w["start_ms"], w["end_ms"]), w["value"]) for w in expected_expired_windows - ] - - produce_calls = [ - call( - key=k, - value=v, - headers={CHANGELOG_CF_MESSAGE_HEADER: "default"}, - topic=store_partition._changelog_producer._changelog_name, - partition=store_partition._changelog_producer._partition_num, - ) - for k, v in zip( - expected_update_produce_keys, expected_update_produce_values - ) - ] - - produce_calls.append( - call( - key=key + PREFIX_SEPARATOR + LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, - value=str(expected_expired_windows[-1]["start_ms"]).encode(), - headers={CHANGELOG_CF_MESSAGE_HEADER: LATEST_EXPIRED_WINDOW_CF_NAME}, - topic=store_partition._changelog_producer._changelog_name, - partition=store_partition._changelog_producer._partition_num, - ) - ) - - producer.produce.assert_has_calls(produce_calls) - assert producer.produce.call_count == len(produce_calls) - - with store.start_partition_transaction(0) as tx: - assert tx.get_window(start_ms=0, end_ms=10, prefix=key) is None - assert tx.get_window(start_ms=10, end_ms=20, prefix=key) is None - assert tx.get_window(start_ms=20, end_ms=30, prefix=key) == 3 From 00df6c3a0cb20fd0f097ae097e02c07394ffb8f4 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 23 Apr 2024 11:47:59 +0200 Subject: [PATCH 06/28] Separate exceptions for RowProducer and RowConsumer --- quixstreams/kafka/exceptions.py | 8 +++++++- quixstreams/rowconsumer.py | 4 ++-- quixstreams/rowproducer.py | 4 ++-- tests/test_quixstreams/test_app.py | 4 ++-- tests/test_quixstreams/test_rowconsumer.py | 8 ++++---- tests/test_quixstreams/test_rowproducer.py | 6 +++--- 6 files changed, 20 insertions(+), 14 deletions(-) diff --git a/quixstreams/kafka/exceptions.py b/quixstreams/kafka/exceptions.py index 70c3feab4..ba9c71ce7 100644 --- a/quixstreams/kafka/exceptions.py +++ b/quixstreams/kafka/exceptions.py @@ -3,7 +3,7 @@ from quixstreams.exceptions import QuixException -class KafkaException(QuixException): +class BaseKafkaException(QuixException): def __init__(self, error: KafkaError): self.error = error @@ -24,3 +24,9 @@ def __str__(self): def __repr__(self): return str(self) + + +class KafkaConsumerException(BaseKafkaException): ... + + +class KafkaProducerDeliveryError(BaseKafkaException): ... 
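A minimal sketch of how the split exceptions surface to callers — `producer`, `consumer` and the topic name are placeholders for an already-configured `RowProducer`, a `RowConsumer`, and an existing topic:

    from quixstreams.kafka.exceptions import (
        KafkaConsumerException,
        KafkaProducerDeliveryError,
    )

    try:
        producer.produce(topic="my-topic", key=b"key", value=b"value")
        producer.flush()  # re-raises the last recorded delivery error, if any
        # Successfully delivered offsets are tracked per (topic, partition)
        print(producer.offsets)
    except KafkaProducerDeliveryError as exc:
        print(f"delivery failed: code={exc.code} description={exc.description}")

    try:
        row = consumer.poll_row(timeout=5.0)  # raises if the polled message carries an error
    except KafkaConsumerException as exc:
        print(f"consume failed: code={exc.code} description={exc.description}")

The consumer error is raised synchronously from `poll_row()`, while producer delivery errors are reported asynchronously by the delivery callback and re-raised on the next `produce()` or `flush()`.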
diff --git a/quixstreams/rowconsumer.py b/quixstreams/rowconsumer.py index 977e4e3e4..21a99fe03 100644 --- a/quixstreams/rowconsumer.py +++ b/quixstreams/rowconsumer.py @@ -7,7 +7,7 @@ from .exceptions import PartitionAssignmentError from .kafka import Consumer, AssignmentStrategy, AutoOffsetReset from .kafka.consumer import RebalancingCallback -from .kafka.exceptions import KafkaException +from .kafka.exceptions import KafkaConsumerException from .models import Topic, Row from .models.serializers.exceptions import IgnoreMessage @@ -134,7 +134,7 @@ def poll_row(self, timeout: float = None) -> Union[Row, List[Row], None]: topic_name, partition, offset = msg.topic(), msg.partition(), msg.offset() try: if msg.error(): - raise KafkaException(error=msg.error()) + raise KafkaConsumerException(error=msg.error()) topic = self._topics[topic_name] diff --git a/quixstreams/rowproducer.py b/quixstreams/rowproducer.py index 903bbc261..76322ac69 100644 --- a/quixstreams/rowproducer.py +++ b/quixstreams/rowproducer.py @@ -4,7 +4,7 @@ from confluent_kafka import KafkaError, Message from .error_callbacks import ProducerErrorCallback, default_on_producer_error -from .kafka.exceptions import KafkaException +from .kafka.exceptions import KafkaProducerDeliveryError from .kafka.producer import Producer, Partitioner from .models import Topic, Row, Headers @@ -143,7 +143,7 @@ def _on_delivery(self, err: Optional[KafkaError], msg: Message): def _raise_for_error(self): if self._error is not None: - exc = KafkaException(self._error) + exc = KafkaProducerDeliveryError(self._error) self._error = None raise exc diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py index 9c1edcf3f..5d9a2da2e 100644 --- a/tests/test_quixstreams/test_app.py +++ b/tests/test_quixstreams/test_app.py @@ -12,7 +12,7 @@ from quixstreams.app import Application from quixstreams.dataframe import StreamingDataFrame from quixstreams.dataframe.windows.base import get_window_ranges -from quixstreams.kafka.exceptions import KafkaException +from quixstreams.kafka.exceptions import KafkaConsumerException from quixstreams.models import ( DoubleDeserializer, DoubleSerializer, @@ -182,7 +182,7 @@ def test_run_consumer_error_raised(self, app_factory, executor): # Stop app after 10s if nothing failed executor.submit(_stop_app_on_timeout, app, 10.0) - with pytest.raises(KafkaException): + with pytest.raises(KafkaConsumerException): app.run(sdf) def test_run_deserialization_error_raised(self, app_factory, executor): diff --git a/tests/test_quixstreams/test_rowconsumer.py b/tests/test_quixstreams/test_rowconsumer.py index f4dcfec44..834050778 100644 --- a/tests/test_quixstreams/test_rowconsumer.py +++ b/tests/test_quixstreams/test_rowconsumer.py @@ -7,7 +7,7 @@ IgnoreMessage, SerializationError, ) -from quixstreams.kafka.exceptions import KafkaException +from quixstreams.kafka.exceptions import KafkaConsumerException from tests.utils import Timeout @@ -61,7 +61,7 @@ def test_poll_row_kafka_error( auto_offset_reset="earliest", ) as consumer: consumer.subscribe([topic]) - with pytest.raises(KafkaException) as raised: + with pytest.raises(KafkaConsumerException) as raised: consumer.poll_row(10.0) exc = raised.value assert exc.code == KafkaError.UNKNOWN_TOPIC_OR_PART @@ -112,7 +112,7 @@ def test_poll_row_kafka_error_raise( producer.produce(topic.name, key=b"key", value=b"value") producer.flush() consumer.subscribe([topic]) - with pytest.raises(KafkaException): + with pytest.raises(KafkaConsumerException): consumer.poll_row(10.0) def 
test_poll_row_deserialization_error_suppress( @@ -147,7 +147,7 @@ def test_poll_row_kafka_error_suppress( suppressed = False def on_error(exc, *args): - assert isinstance(exc, KafkaException) + assert isinstance(exc, KafkaConsumerException) nonlocal suppressed suppressed = True return True diff --git a/tests/test_quixstreams/test_rowproducer.py b/tests/test_quixstreams/test_rowproducer.py index 59f15b09d..ab0e03e27 100644 --- a/tests/test_quixstreams/test_rowproducer.py +++ b/tests/test_quixstreams/test_rowproducer.py @@ -2,7 +2,7 @@ import pytest from confluent_kafka import KafkaException as ConfluentKafkaException -from quixstreams.kafka.exceptions import KafkaException +from quixstreams.kafka.exceptions import KafkaProducerDeliveryError from quixstreams.models import ( JSONSerializer, @@ -142,7 +142,7 @@ def test_produce_delivery_error_raised_on_produce( # Poll for delivery callbacks producer.poll(5) # The next produce should fail after - with pytest.raises(KafkaException): + with pytest.raises(KafkaProducerDeliveryError): producer.produce(topic=topic.name, key=key, value=value) def test_produce_delivery_error_raised_on_flush( @@ -158,5 +158,5 @@ def test_produce_delivery_error_raised_on_flush( # in the delivery callback producer.produce(topic=topic.name, key=key, value=value, partition=3) # The flush should fail after that - with pytest.raises(KafkaException): + with pytest.raises(KafkaProducerDeliveryError): producer.flush() From f1a7b19366fd74d68c526737d97c7b7592e35265 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Wed, 24 Apr 2024 13:45:43 +0200 Subject: [PATCH 07/28] Fix failing Application tests --- tests/test_quixstreams/test_app.py | 229 ++++++++++++++++++----------- 1 file changed, 141 insertions(+), 88 deletions(-) diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py index 5d9a2da2e..366c4aab1 100644 --- a/tests/test_quixstreams/test_app.py +++ b/tests/test_quixstreams/test_app.py @@ -473,120 +473,159 @@ def test_consumer_group_default(self): class TestQuixApplication: def test_init_with_quix_sdk_token_arg(self): - def cfg(): - return { - "sasl.mechanisms": "SCRAM-SHA-256", - "security.protocol": "SASL_SSL", - "bootstrap.servers": "address1,address2", - "sasl.username": "my-username", - "sasl.password": "my-password", - "ssl.ca.location": "/mock/dir/ca.cert", - } - consumer_group = "c_group" expected_workspace_cgroup = f"my_ws-{consumer_group}" quix_sdk_token = "my_sdk_token" + broker_address = "address1,address2" + + extra_config = {"extra": "config"} + auth_params = { + "sasl.mechanisms": "SCRAM-SHA-256", + "security.protocol": "SASL_SSL", + "sasl.username": "my-username", + "sasl.password": "my-password", + "ssl.ca.location": "/mock/dir/ca.cert", + } + confluent_broker_config = { + **auth_params, + "bootstrap.servers": broker_address, + } + expected_extra_config = {**auth_params, **extra_config} def get_cfg_builder(quix_sdk_token): cfg_builder = create_autospec(QuixKafkaConfigsBuilder) - cfg_builder.get_confluent_broker_config.side_effect = cfg + cfg_builder.get_confluent_broker_config.return_value = ( + confluent_broker_config + ) cfg_builder.prepend_workspace_id.return_value = expected_workspace_cgroup cfg_builder.quix_sdk_token = quix_sdk_token return cfg_builder - with patch("quixstreams.app.QuixKafkaConfigsBuilder", get_cfg_builder): - app = Application( + # Mock consumer and producer to check the init args + with patch("quixstreams.app.QuixKafkaConfigsBuilder", get_cfg_builder), patch( + "quixstreams.app.RowConsumer" + ) as 
consumer_init_mock, patch( + "quixstreams.app.RowProducer" + ) as producer_init_mock: + Application( consumer_group=consumer_group, quix_sdk_token=quix_sdk_token, - consumer_extra_config={"extra": "config"}, - producer_extra_config={"extra": "config"}, + consumer_extra_config=extra_config, + producer_extra_config=extra_config, ) # Check if items from the Quix config have been passed # to the low-level configs of producer and consumer - assert cfg().items() <= app._producer._producer_config.items() - assert cfg().items() <= app._consumer._consumer_config.items() - - assert app._producer._producer_config["extra"] == "config" - assert app._consumer._consumer_config["extra"] == "config" - assert app._consumer._consumer_config["group.id"] == expected_workspace_cgroup - - def test_init_with_quix_sdk_token_env(self): - def cfg(): - return { - "sasl.mechanisms": "SCRAM-SHA-256", - "security.protocol": "SASL_SSL", - "bootstrap.servers": "address1,address2", - "sasl.username": "my-username", - "sasl.password": "my-password", - "ssl.ca.location": "/mock/dir/ca.cert", - } + producer_call_kwargs = producer_init_mock.call_args.kwargs + assert producer_call_kwargs["broker_address"] == broker_address + assert producer_call_kwargs["extra_config"] == expected_extra_config + + consumer_call_kwargs = consumer_init_mock.call_args.kwargs + assert consumer_call_kwargs["broker_address"] == broker_address + assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup + assert consumer_call_kwargs["extra_config"] == expected_extra_config + def test_init_with_quix_sdk_token_env(self, monkeypatch): consumer_group = "c_group" expected_workspace_cgroup = f"my_ws-{consumer_group}" quix_sdk_token = "my_sdk_token" + broker_address = "address1,address2" + + extra_config = {"extra": "config"} + auth_params = { + "sasl.mechanisms": "SCRAM-SHA-256", + "security.protocol": "SASL_SSL", + "sasl.username": "my-username", + "sasl.password": "my-password", + "ssl.ca.location": "/mock/dir/ca.cert", + } + confluent_broker_config = { + **auth_params, + "bootstrap.servers": broker_address, + } + expected_extra_config = {**auth_params, **extra_config} def get_cfg_builder(quix_sdk_token): cfg_builder = create_autospec(QuixKafkaConfigsBuilder) - cfg_builder.get_confluent_broker_config.side_effect = cfg + cfg_builder.get_confluent_broker_config.return_value = ( + confluent_broker_config + ) cfg_builder.prepend_workspace_id.return_value = expected_workspace_cgroup cfg_builder.quix_sdk_token = quix_sdk_token return cfg_builder - with patch.dict(os.environ, {"Quix__Sdk__Token": quix_sdk_token}): - with patch("quixstreams.app.QuixKafkaConfigsBuilder", get_cfg_builder): - app = Application( - consumer_group=consumer_group, - consumer_extra_config={"extra": "config"}, - producer_extra_config={"extra": "config"}, - ) + monkeypatch.setenv("Quix__Sdk__Token", quix_sdk_token) + with patch("quixstreams.app.QuixKafkaConfigsBuilder", get_cfg_builder), patch( + "quixstreams.app.RowConsumer" + ) as consumer_init_mock, patch( + "quixstreams.app.RowProducer" + ) as producer_init_mock: + Application( + consumer_group=consumer_group, + consumer_extra_config=extra_config, + producer_extra_config=extra_config, + ) # Check if items from the Quix config have been passed # to the low-level configs of producer and consumer - assert cfg().items() <= app._producer._producer_config.items() - assert cfg().items() <= app._consumer._consumer_config.items() + producer_call_kwargs = producer_init_mock.call_args.kwargs + assert 
producer_call_kwargs["broker_address"] == broker_address + assert producer_call_kwargs["extra_config"] == expected_extra_config - assert app._producer._producer_config["extra"] == "config" - assert app._consumer._consumer_config["extra"] == "config" - assert app._consumer._consumer_config["group.id"] == expected_workspace_cgroup + consumer_call_kwargs = consumer_init_mock.call_args.kwargs + assert consumer_call_kwargs["broker_address"] == broker_address + assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup + assert consumer_call_kwargs["extra_config"] == expected_extra_config def test_init_with_quix_config_builder(self): - def cfg(): - return { - "sasl.mechanisms": "SCRAM-SHA-256", - "security.protocol": "SASL_SSL", - "bootstrap.servers": "address1,address2", - "sasl.username": "my-username", - "sasl.password": "my-password", - "ssl.ca.location": "/mock/dir/ca.cert", - } - consumer_group = "c_group" expected_workspace_cgroup = f"my_ws-{consumer_group}" quix_sdk_token = "my_sdk_token" + broker_address = "address1,address2" + + extra_config = {"extra": "config"} + auth_params = { + "sasl.mechanisms": "SCRAM-SHA-256", + "security.protocol": "SASL_SSL", + "sasl.username": "my-username", + "sasl.password": "my-password", + "ssl.ca.location": "/mock/dir/ca.cert", + } + confluent_broker_config = { + **auth_params, + "bootstrap.servers": broker_address, + } + expected_extra_config = {**auth_params, **extra_config} def get_cfg_builder(quix_sdk_token): cfg_builder = create_autospec(QuixKafkaConfigsBuilder) - cfg_builder.get_confluent_broker_config.side_effect = cfg + cfg_builder.get_confluent_broker_config.return_value = ( + confluent_broker_config + ) cfg_builder.prepend_workspace_id.return_value = expected_workspace_cgroup cfg_builder.quix_sdk_token = quix_sdk_token return cfg_builder - app = Application( - consumer_group=consumer_group, - quix_config_builder=get_cfg_builder(quix_sdk_token), - consumer_extra_config={"extra": "config"}, - producer_extra_config={"extra": "config"}, - ) + with patch("quixstreams.app.RowConsumer") as consumer_init_mock, patch( + "quixstreams.app.RowProducer" + ) as producer_init_mock: + Application( + consumer_group=consumer_group, + quix_config_builder=get_cfg_builder(quix_sdk_token), + consumer_extra_config={"extra": "config"}, + producer_extra_config={"extra": "config"}, + ) # Check if items from the Quix config have been passed # to the low-level configs of producer and consumer - assert cfg().items() <= app._producer._producer_config.items() - assert cfg().items() <= app._consumer._consumer_config.items() + producer_call_kwargs = producer_init_mock.call_args.kwargs + assert producer_call_kwargs["broker_address"] == broker_address + assert producer_call_kwargs["extra_config"] == expected_extra_config - assert app._producer._producer_config["extra"] == "config" - assert app._consumer._consumer_config["extra"] == "config" - assert app._consumer._consumer_config["group.id"] == expected_workspace_cgroup + consumer_call_kwargs = consumer_init_mock.call_args.kwargs + assert consumer_call_kwargs["broker_address"] == broker_address + assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup + assert consumer_call_kwargs["extra_config"] == expected_extra_config def test_init_with_broker_id_raises(self): with pytest.raises(ValueError) as e_info: @@ -679,35 +718,49 @@ class TestDeprecatedApplicationDotQuix: """ def test_init(self): - def cfg(): - return { - "sasl.mechanisms": "SCRAM-SHA-256", - "security.protocol": "SASL_SSL", - 
"bootstrap.servers": "address1,address2", - "sasl.username": "my-username", - "sasl.password": "my-password", - "ssl.ca.location": "/mock/dir/ca.cert", - } + consumer_group = "c_group" + expected_workspace_cgroup = f"my_ws-{consumer_group}" + broker_address = "address1,address2" + + extra_config = {"extra": "config"} + auth_params = { + "sasl.mechanisms": "SCRAM-SHA-256", + "security.protocol": "SASL_SSL", + "sasl.username": "my-username", + "sasl.password": "my-password", + "ssl.ca.location": "/mock/dir/ca.cert", + } + confluent_broker_config = { + **auth_params, + "bootstrap.servers": broker_address, + } + expected_extra_config = {**auth_params, **extra_config} cfg_builder = create_autospec(QuixKafkaConfigsBuilder) - cfg_builder.get_confluent_broker_config.side_effect = cfg + cfg_builder.get_confluent_broker_config.return_value = confluent_broker_config cfg_builder.prepend_workspace_id.return_value = "my_ws-c_group" cfg_builder.strip_workspace_id_prefix.return_value = "c_group" - app = Application.Quix( - quix_config_builder=cfg_builder, - consumer_group="c_group", - consumer_extra_config={"extra": "config"}, - producer_extra_config={"extra": "config"}, - ) + with patch("quixstreams.app.RowConsumer") as consumer_init_mock, patch( + "quixstreams.app.RowProducer" + ) as producer_init_mock: + Application.Quix( + quix_config_builder=cfg_builder, + consumer_group="c_group", + consumer_extra_config={"extra": "config"}, + producer_extra_config={"extra": "config"}, + ) # Check if items from the Quix config have been passed # to the low-level configs of producer and consumer - assert cfg().items() <= app._producer._producer_config.items() - assert cfg().items() <= app._consumer._consumer_config.items() + producer_call_kwargs = producer_init_mock.call_args.kwargs + assert producer_call_kwargs["broker_address"] == broker_address + assert producer_call_kwargs["extra_config"] == expected_extra_config + + consumer_call_kwargs = consumer_init_mock.call_args.kwargs + assert consumer_call_kwargs["broker_address"] == broker_address + assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup + assert consumer_call_kwargs["extra_config"] == expected_extra_config - assert app._producer._producer_config["extra"] == "config" - assert app._consumer._consumer_config["extra"] == "config" - assert app._consumer._consumer_config["group.id"] == "my_ws-c_group" cfg_builder.prepend_workspace_id.assert_called_with("c_group") def test_topic_name_and_config(self, app_dot_quix_factory): From 9ad15ddbfc5a58a2b95a3812e610394be314f1e2 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Wed, 24 Apr 2024 13:48:25 +0200 Subject: [PATCH 08/28] Move checkpoint to a module and add tests --- quixstreams/checkpointing/__init__.py | 2 + quixstreams/{ => checkpointing}/checkpoint.py | 96 +++-- quixstreams/checkpointing/exceptions.py | 4 + quixstreams/processing_context.py | 2 +- quixstreams/state/exceptions.py | 3 + tests/test_quixstreams/fixtures.py | 5 + tests/test_quixstreams/test_checkpoint.py | 166 --------- tests/test_quixstreams/test_checkpointing.py | 328 ++++++++++++++++++ 8 files changed, 403 insertions(+), 203 deletions(-) create mode 100644 quixstreams/checkpointing/__init__.py rename quixstreams/{ => checkpointing}/checkpoint.py (54%) create mode 100644 quixstreams/checkpointing/exceptions.py delete mode 100644 tests/test_quixstreams/test_checkpoint.py create mode 100644 tests/test_quixstreams/test_checkpointing.py diff --git a/quixstreams/checkpointing/__init__.py b/quixstreams/checkpointing/__init__.py 
new file mode 100644 index 000000000..92235318b --- /dev/null +++ b/quixstreams/checkpointing/__init__.py @@ -0,0 +1,2 @@ +from .checkpoint import Checkpoint as Checkpoint +from .exceptions import InvalidStoredOffset as InvalidStoredOffset diff --git a/quixstreams/checkpoint.py b/quixstreams/checkpointing/checkpoint.py similarity index 54% rename from quixstreams/checkpoint.py rename to quixstreams/checkpointing/checkpoint.py index baaa98df5..9d16164e2 100644 --- a/quixstreams/checkpoint.py +++ b/quixstreams/checkpointing/checkpoint.py @@ -4,17 +4,19 @@ from confluent_kafka import TopicPartition -from quixstreams.kafka import Consumer, Producer +from quixstreams.kafka import Consumer +from quixstreams.rowproducer import RowProducer from quixstreams.state import ( StateStoreManager, PartitionTransaction, DEFAULT_STATE_STORE_NAME, ) +from quixstreams.state.exceptions import StoreTransactionFailed +from .exceptions import InvalidStoredOffset logger = logging.getLogger(__name__) -# TODO: Tests class Checkpoint: """ Class to keep track of state updates and consumer offsets and to checkpoint these @@ -24,25 +26,22 @@ class Checkpoint: def __init__( self, commit_interval: float, - producer: Producer, + producer: RowProducer, consumer: Consumer, state_manager: StateStoreManager, ): self._created_at = time.monotonic() + # A mapping of <(topic, partition): processed offset> self._tp_offsets: Dict[Tuple[str, int], int] = {} # A mapping of <(topic, partition, store_name): PartitionTransaction> self._store_transactions: Dict[(str, int, str), PartitionTransaction] = {} - # Ensure the checkpoint is not negative. # Passing zero or lower will flush the checkpoint after each processed message self._commit_interval = max(commit_interval, 0) self._state_manager = state_manager self._consumer = consumer self._producer = producer - # TODO: Can the checkpoint object be reused? - # Do we need to validate that it can't? - def expired(self) -> bool: """ Returns `True` if checkpoint deadline has expired. @@ -64,6 +63,16 @@ def store_offset(self, topic: str, partition: int, offset: int): :param partition: partition number :param offset: message offset """ + stored_offset = self._tp_offsets.get((topic, partition), -1) + # A paranoid check to ensure that processed offsets always increase within the + # same checkpoint. + # It shouldn't normally happen, but a lot of logic relies on it, + # and it's better to be safe. + if offset < stored_offset: + raise InvalidStoredOffset( + f"Cannot store offset smaller or equal than already processed" + f" one: {offset} <= {stored_offset}" + ) self._tp_offsets[(topic, partition)] = offset def get_store_transaction( @@ -97,45 +106,60 @@ def commit(self): 1. Flush the changelogs for each state store and ensure everything is produced. 2. Commit topic offsets. 3. Flush each state store partition to the disk. - """ - # TODO: Error handling - - # 0. Produce the changelogs - # for ( - # topic, - # partition, - # store_name, - # ), transaction in self._store_transactions.items(): - # offset = self._tp_offsets[(topic, partition)] - # # TODO: Flush the changelogs. Call it "prepare"? - # if transaction.failed: - # raise - # transaction.prepare(offset=offset) - - # 1. Flush producer - # TODO: Check if all messages are flushed successfully - # TODO: Take the produced changelog offsets - # TODO: Logs + + if not self._tp_offsets: + # No messages have been processed during this checkpoint, return + return + + # Step 1. 
Produce the changelogs + for ( + topic, + partition, + store_name, + ), transaction in self._store_transactions.items(): + offset = self._tp_offsets[(topic, partition)] + if transaction.failed: + raise StoreTransactionFailed( + f'Detected a failed transaction for store "{store_name}", ' + f"the checkpoint is aborted" + ) + transaction.prepare(processed_offset=offset) + + # Step 2. Flush producer to trigger all delivery callbacks and ensure that + # all messages are produced self._producer.flush() + # Get produced offsets after flushing the producer + produced_offsets = self._producer.offsets - # 2. Commit offsets to Kafka + # Step 3. Commit offsets to Kafka offsets = [ TopicPartition(topic=topic, partition=partition, offset=offset + 1) for (topic, partition), offset in self._tp_offsets.items() ] - if offsets: - self._consumer.commit(offsets=offsets, asynchronous=False) + self._consumer.commit(offsets=offsets, asynchronous=False) - # 3. Flush state store partitions to the disk + # Step 4. Flush state store partitions to the disk together with changelog + # offsets for ( topic, partition, store_name, ), transaction in self._store_transactions.items(): - offset = self._tp_offsets.get((topic, partition)) - if offset is not None: - transaction.maybe_flush(offset=offset) - - # TODO: Remove when the new changelog producer is implemented - self._producer.flush() + offset = self._tp_offsets[(topic, partition)] + + # Get the changelog topic-partition for the given transaction + # It can be None if changelog topics are disabled in the app + changelog_tp = transaction.changelog_topic_partition + # The changelog offset also can be None if no updates happened + # during transaction + changelog_offset = ( + produced_offsets.get(changelog_tp) if changelog_tp is not None else None + ) + if changelog_offset is not None: + # Increment the changelog offset by one to match the high watermark + # in Kafka + changelog_offset += 1 + transaction.flush( + processed_offset=offset, changelog_offset=changelog_offset + ) diff --git a/quixstreams/checkpointing/exceptions.py b/quixstreams/checkpointing/exceptions.py new file mode 100644 index 000000000..fc0bff910 --- /dev/null +++ b/quixstreams/checkpointing/exceptions.py @@ -0,0 +1,4 @@ +from quixstreams.exceptions import QuixException + + +class InvalidStoredOffset(QuixException): ... diff --git a/quixstreams/processing_context.py b/quixstreams/processing_context.py index aae237309..84d049f3a 100644 --- a/quixstreams/processing_context.py +++ b/quixstreams/processing_context.py @@ -2,7 +2,7 @@ import logging from typing import Optional -from quixstreams.checkpoint import Checkpoint +from quixstreams.checkpointing import Checkpoint from quixstreams.exceptions import QuixException from quixstreams.rowconsumer import RowConsumer from quixstreams.rowproducer import RowProducer diff --git a/quixstreams/state/exceptions.py b/quixstreams/state/exceptions.py index ee1ba062b..7ec216213 100644 --- a/quixstreams/state/exceptions.py +++ b/quixstreams/state/exceptions.py @@ -14,3 +14,6 @@ class WindowedStoreAlreadyRegisteredError(QuixException): ... class InvalidStoreTransactionStateError(QuixException): ... + + +class StoreTransactionFailed(QuixException): ... 
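For reference, the new Checkpoint introduced by this patch is driven roughly as follows. This is a minimal usage sketch based on the tests added below; the `consumer`, `row_producer`, and `state_manager` objects are assumed to be wired up elsewhere (e.g. by the Application), and the topic/partition/offset values are placeholders:

    from quixstreams.checkpointing import Checkpoint

    # Create a checkpoint that commits at most once per second
    checkpoint = Checkpoint(
        commit_interval=1.0,
        producer=row_producer,        # assumed RowProducer instance
        consumer=consumer,            # assumed Consumer instance
        state_manager=state_manager,  # assumed StateStoreManager instance
    )

    # Per processed message: update the state and remember the processed offset
    tx = checkpoint.get_store_transaction("topic", 0)
    tx.set(key="key", value="value", prefix=b"__key__")
    checkpoint.store_offset("topic", 0, 10)

    # Commit once the interval has elapsed: produce the changelogs, flush the
    # producer, commit the consumer offsets, then flush the state partitions
    if not checkpoint.empty() and checkpoint.expired():
        checkpoint.commit()
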
diff --git a/tests/test_quixstreams/fixtures.py b/tests/test_quixstreams/fixtures.py index 30da3fc65..ab66f924a 100644 --- a/tests/test_quixstreams/fixtures.py +++ b/tests/test_quixstreams/fixtures.py @@ -240,6 +240,11 @@ def factory( return factory +@pytest.fixture() +def row_producer(row_producer_factory): + return row_producer_factory() + + @pytest.fixture() def row_factory(): """ diff --git a/tests/test_quixstreams/test_checkpoint.py b/tests/test_quixstreams/test_checkpoint.py deleted file mode 100644 index 6ee42251e..000000000 --- a/tests/test_quixstreams/test_checkpoint.py +++ /dev/null @@ -1,166 +0,0 @@ -import contextlib -from unittest.mock import patch - -import pytest - -from quixstreams.state.exceptions import ( - StoreNotRegisteredError, - InvalidStoreTransactionStateError, -) -from quixstreams.state.rocksdb import RocksDBPartitionTransaction -from tests.utils import TopicPartitionStub - - -@pytest.mark.skip("Checkpoint tests") -class TestCheckpoint: - def test_get_store_transaction_store_not_registered_fails(self, state_manager): - with pytest.raises(StoreNotRegisteredError): - with state_manager.start_store_transaction("topic", 0, 0): - ... - - def test_get_store_transaction_not_started(self, state_manager): - with pytest.raises(InvalidStoreTransactionStateError): - state_manager.get_store_transaction("store") - - def test_store_transaction_success(self, state_manager): - state_manager.register_store("topic", "store") - tp = TopicPartitionStub("topic", 0) - state_manager.on_partition_assign(tp) - - store = state_manager.get_store("topic", "store") - store_partition = store.partitions[0] - - assert store_partition.get_processed_offset() is None - - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx = state_manager.get_store_transaction("store") - tx.set("some_key", "some_value", prefix=b"__key__") - - state_manager.on_partition_assign(tp) - - store = state_manager.get_store("topic", "store") - store_partition = store.partitions[0] - - assert store_partition.get_processed_offset() == 1 - - def test_store_transaction_no_flush_if_partition_transaction_failed( - self, state_manager - ): - """ - Ensure that no PartitionTransactions are flushed to the DB if - any of them fails - """ - state_manager.register_store("topic", "store1") - state_manager.register_store("topic", "store2") - state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) - store1 = state_manager.get_store("topic", "store1") - store2 = state_manager.get_store("topic", "store2") - - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx_store1 = state_manager.get_store_transaction("store1") - tx_store2 = state_manager.get_store_transaction("store2") - # Simulate exception in one of the transactions - with contextlib.suppress(ValueError), patch.object( - RocksDBPartitionTransaction, - "_serialize_key", - side_effect=ValueError("test"), - ): - tx_store1.set("some_key", "some_value") - tx_store2.set("some_key", "some_value") - - assert store1.partitions[0].get_processed_offset() is None - assert store2.partitions[0].get_processed_offset() is None - - def test_start_store_transaction_already_started(self, state_manager): - state_manager.register_store("topic", "store") - with state_manager.start_store_transaction("topic", partition=0, offset=0): - with pytest.raises(InvalidStoreTransactionStateError): - with state_manager.start_store_transaction( - "topic", partition=0, offset=0 - ): - ... 
- - def test_store_transaction_no_flush_on_exception(self, state_manager): - state_manager.register_store("topic", "store") - state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) - store = state_manager.get_store("topic", "store") - - with contextlib.suppress(Exception): - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx = state_manager.get_store_transaction("store") - tx.set("some_key", "some_value") - raise ValueError() - - store_partition = store.partitions[0] - assert store_partition.get_processed_offset() is None - - -@pytest.mark.skip("Checkpoint tests") -class TestCheckpointChangelog: - def test_store_transaction_no_flush_on_exception( - self, - state_manager_changelogs, - ): - state_manager = state_manager_changelogs - recovery_manager = state_manager._recovery_manager - topic_manager = recovery_manager._topic_manager - producer = state_manager._producer - consumer = recovery_manager._consumer - - consumer.get_watermark_offsets.return_value = (0, 10) - topic_manager.topic(name="topic") - state_manager.register_store("topic", store_name="store") - state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) - store = state_manager.get_store("topic", "store") - - with contextlib.suppress(Exception): - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx = state_manager.get_store_transaction("store") - tx.set("some_key", "some_value") - raise ValueError() - - store_partition = store.partitions[0] - assert store_partition.get_processed_offset() is None - assert store_partition.get_changelog_offset() is None - producer.produce.assert_not_called() - - def test_store_transaction_no_flush_if_partition_transaction_failed( - self, - state_manager_changelogs, - ): - """ - Ensure that no PartitionTransactions are flushed to the DB if - any of them fails - """ - state_manager = state_manager_changelogs - recovery_manager = state_manager._recovery_manager - topic_manager = recovery_manager._topic_manager - producer = state_manager._producer - consumer = recovery_manager._consumer - - consumer.get_watermark_offsets.return_value = (0, 10) - topic_manager.topic(name="topic") - state_manager.register_store("topic", store_name="store1") - state_manager.register_store("topic", store_name="store2") - state_manager.on_partition_assign(TopicPartitionStub("topic", 0)) - - store1 = state_manager.get_store("topic", "store1") - store2 = state_manager.get_store("topic", "store2") - - with state_manager.start_store_transaction("topic", partition=0, offset=1): - tx_store1 = state_manager.get_store_transaction("store1") - tx_store2 = state_manager.get_store_transaction("store2") - # Simulate exception in one of the transactions - with contextlib.suppress(ValueError), patch.object( - RocksDBPartitionTransaction, - "_serialize_key", - side_effect=ValueError("test"), - ): - tx_store1.set("some_key", "some_value") - tx_store2.set("some_key", "some_value") - - assert store1.partitions[0].get_processed_offset() is None - assert store1.partitions[0].get_changelog_offset() is None - assert store2.partitions[0].get_processed_offset() is None - assert store2.partitions[0].get_changelog_offset() is None - producer.produce.assert_not_called() diff --git a/tests/test_quixstreams/test_checkpointing.py b/tests/test_quixstreams/test_checkpointing.py new file mode 100644 index 000000000..f56fa19e1 --- /dev/null +++ b/tests/test_quixstreams/test_checkpointing.py @@ -0,0 +1,328 @@ +import contextlib +from typing import Optional +from unittest.mock import 
patch, MagicMock + +import pytest +from confluent_kafka import TopicPartition + +from quixstreams.checkpointing import Checkpoint, InvalidStoredOffset +from quixstreams.kafka import Consumer +from quixstreams.rowproducer import RowProducer +from quixstreams.state import StateStoreManager +from quixstreams.state.exceptions import StoreNotRegisteredError, StoreTransactionFailed +from quixstreams.state.rocksdb import RocksDBPartitionTransaction + + +@pytest.fixture() +def checkpoint_factory(state_manager, consumer, row_producer): + def factory( + commit_interval: float = 1, + consumer_: Optional[Consumer] = None, + producer_: Optional[RowProducer] = None, + state_manager_: Optional[StateStoreManager] = None, + ): + return Checkpoint( + commit_interval=commit_interval, + producer=producer_ or row_producer, + consumer=consumer_ or consumer, + state_manager=state_manager_ or state_manager, + ) + + return factory + + +class TestCheckpoint: + def test_empty_true(self, checkpoint_factory): + checkpoint = checkpoint_factory() + assert checkpoint.empty() + + def test_empty_false(self, checkpoint_factory): + checkpoint = checkpoint_factory() + checkpoint.store_offset("topic", 0, 0) + assert not checkpoint.empty() + + @pytest.mark.parametrize("commit_interval, expired", [(0, True), (999, False)]) + def test_expired(self, commit_interval, expired, checkpoint_factory): + checkpoint = checkpoint_factory(commit_interval=commit_interval) + assert checkpoint.expired() == expired + + def test_store_already_processed_offset_fails(self, checkpoint_factory): + checkpoint = checkpoint_factory() + checkpoint.store_offset("topic", 0, 10) + with pytest.raises(InvalidStoredOffset): + checkpoint.store_offset("topic", 0, 9) + + def test_commit_no_state_success( + self, checkpoint_factory, consumer, state_manager, topic_factory + ): + topic_name, _ = topic_factory() + checkpoint = checkpoint_factory( + consumer_=consumer, state_manager_=state_manager + ) + processed_offset = 999 + # Store the processed offset to simulate processing + checkpoint.store_offset(topic_name, 0, processed_offset) + + checkpoint.commit() + tp, *_ = consumer.committed([TopicPartition(topic=topic_name, partition=0)]) + assert tp.offset == processed_offset + 1 + + def test_commit_with_state_no_changelog_success( + self, checkpoint_factory, consumer, state_manager_factory, topic_factory + ): + topic_name, _ = topic_factory() + producer_mock = MagicMock(spec_set=RowProducer) + state_manager = state_manager_factory(producer=producer_mock) + checkpoint = checkpoint_factory( + consumer_=consumer, state_manager_=state_manager, producer_=producer_mock + ) + processed_offset = 999 + key, value, prefix = "key", "value", b"__key__" + state_manager.register_store(topic_name, "default") + store = state_manager.get_store(topic_name, "default") + store_partition = store.assign_partition(0) + + # Do some state updates and store the processed offset to simulate processing + tx = checkpoint.get_store_transaction(topic_name, 0) + tx.set(key=key, value=value, prefix=prefix) + checkpoint.store_offset(topic_name, 0, processed_offset) + + # Commit the checkpoint + checkpoint.commit() + + # Check the offset is committed + tp, *_ = consumer.committed([TopicPartition(topic=topic_name, partition=0)]) + assert tp.offset == processed_offset + 1 + + # Check the producer is flushed + assert producer_mock.flush.call_count == 1 + + # Check the state is flushed + assert tx.completed + new_tx = store.start_partition_transaction(0) + assert new_tx.get(key=key, prefix=prefix) == 
value
+
+        # No changelogs should be flushed
+        assert not store_partition.get_changelog_offset()
+        # Processed offset should be stored
+        assert store_partition.get_processed_offset() == processed_offset
+
+    def test_commit_with_state_with_changelog_success(
+        self,
+        checkpoint_factory,
+        row_producer,
+        consumer,
+        state_manager_factory,
+        recovery_manager_factory,
+        topic_factory,
+    ):
+        topic_name, _ = topic_factory()
+        recovery_manager = recovery_manager_factory(consumer=consumer)
+        state_manager = state_manager_factory(
+            producer=row_producer, recovery_manager=recovery_manager
+        )
+        checkpoint = checkpoint_factory(
+            consumer_=consumer, state_manager_=state_manager, producer_=row_producer
+        )
+        processed_offset = 999
+        value, prefix = "value", b"__key__"
+        state_manager.register_store(topic_name, "default")
+        store = state_manager.get_store(topic_name, "default")
+        store_partition = store.assign_partition(0)
+
+        # Do a couple of state updates to send more messages to the changelog
+        tx = checkpoint.get_store_transaction(topic_name, 0)
+        tx.set(key="key1", value=value, prefix=prefix)
+        tx.set(key="key2", value=value, prefix=prefix)
+        checkpoint.store_offset(topic_name, 0, processed_offset)
+
+        # Commit the checkpoint
+        checkpoint.commit()
+
+        # Check the state is flushed
+        assert tx.completed
+
+        # Check the changelog offset
+        # The changelog offset must be equal to the number of updated keys
+        assert store_partition.get_changelog_offset() == 2
+        assert store_partition.get_processed_offset() == 999
+
+    def test_commit_with_state_and_changelog_no_updates_success(
+        self,
+        checkpoint_factory,
+        row_producer,
+        consumer,
+        state_manager_factory,
+        recovery_manager_factory,
+        topic_factory,
+    ):
+        topic_name, _ = topic_factory()
+        recovery_manager = recovery_manager_factory(consumer=consumer)
+        state_manager = state_manager_factory(
+            producer=row_producer, recovery_manager=recovery_manager
+        )
+        checkpoint = checkpoint_factory(
+            consumer_=consumer, state_manager_=state_manager, producer_=row_producer
+        )
+        processed_offset = 999
+        value, prefix = "value", b"__key__"
+        state_manager.register_store(topic_name, "default")
+        store = state_manager.get_store(topic_name, "default")
+        store_partition = store.assign_partition(0)
+
+        # Create a transaction but don't update any keys
+        tx = checkpoint.get_store_transaction(topic_name, 0)
+        checkpoint.store_offset(topic_name, 0, processed_offset)
+
+        # Commit the checkpoint
+        checkpoint.commit()
+
+        # Check the transaction is completed
+        assert tx.completed
+
+        # The changelog and processed offsets should be empty because no updates
+        # happened during the transaction
+        assert not store_partition.get_changelog_offset()
+        assert not store_partition.get_processed_offset()
+
+    def test_commit_no_offsets_stored_noop(
+        self, checkpoint_factory, state_manager_factory, topic_factory
+    ):
+        topic_name, _ = topic_factory()
+        producer_mock = MagicMock(spec_set=RowProducer)
+        consumer_mock = MagicMock(spec_set=Consumer)
+        state_manager = state_manager_factory(producer=producer_mock)
+        checkpoint = checkpoint_factory(
+            consumer_=consumer_mock,
+            state_manager_=state_manager,
+            producer_=producer_mock,
+        )
+        # Commit the checkpoint without processing any messages
+        checkpoint.commit()
+
+        # Check nothing is committed
+        assert not consumer_mock.commit.call_count
+        assert not producer_mock.flush.call_count
+
+    def test_commit_has_failed_transactions_fails(
+        self, checkpoint_factory, state_manager_factory, topic_factory
+    ):
+        producer_mock = 
MagicMock(spec_set=RowProducer) + consumer_mock = MagicMock(spec_set=Consumer) + state_manager = state_manager_factory(producer=producer_mock) + checkpoint = checkpoint_factory( + consumer_=consumer_mock, + state_manager_=state_manager, + producer_=producer_mock, + ) + processed_offset = 999 + key, value, prefix = "key", "value", b"__key__" + state_manager.register_store("topic", "default") + store = state_manager.get_store("topic", "default") + store.assign_partition(0) + + # Simulate a failed transaction + tx = checkpoint.get_store_transaction("topic", 0) + with contextlib.suppress(ValueError), patch.object( + RocksDBPartitionTransaction, + "_serialize_key", + side_effect=ValueError("test"), + ): + tx.set(key=key, value=value, prefix=prefix) + assert tx.failed + + # Store offset to simulate processing + checkpoint.store_offset("topic", 0, processed_offset) + + # Checkpoint commit should fail if any of the transaction is failed + # but the original exception was swallowed by an error callback + with pytest.raises(StoreTransactionFailed): + checkpoint.commit() + + # The producer should not flush + assert not producer_mock.flush.call_count + # Consumer should not commit + assert not consumer_mock.commit.call_count + + def test_commit_producer_flush_fails( + self, checkpoint_factory, state_manager_factory, topic_factory + ): + producer_mock = MagicMock(spec_set=RowProducer) + consumer_mock = MagicMock(spec_set=Consumer) + state_manager = state_manager_factory(producer=producer_mock) + checkpoint = checkpoint_factory( + consumer_=consumer_mock, + state_manager_=state_manager, + producer_=producer_mock, + ) + processed_offset = 999 + key, value, prefix = "key", "value", b"__key__" + state_manager.register_store("topic", "default") + store = state_manager.get_store("topic", "default") + store.assign_partition(0) + + # Do some state updates and store the processed offset to simulate processing + tx = checkpoint.get_store_transaction("topic", 0) + tx.set(key=key, value=value, prefix=prefix) + checkpoint.store_offset("topic", 0, processed_offset) + + producer_mock.flush.side_effect = ValueError("Flush failure") + # Checkpoint commit should fail if producer failed to flush + with pytest.raises(ValueError): + checkpoint.commit() + + # Consumer should not commit + assert not consumer_mock.commit.call_count + # The transaction should remain prepared, but not completed + assert tx.prepared + assert not tx.completed + + def test_commit_consumer_commit_fails( + self, checkpoint_factory, state_manager_factory, topic_factory + ): + producer_mock = MagicMock(spec_set=RowProducer) + consumer_mock = MagicMock(spec_set=Consumer) + state_manager = state_manager_factory(producer=producer_mock) + checkpoint = checkpoint_factory( + consumer_=consumer_mock, + state_manager_=state_manager, + producer_=producer_mock, + ) + processed_offset = 999 + key, value, prefix = "key", "value", b"__key__" + state_manager.register_store("topic", "default") + store = state_manager.get_store("topic", "default") + store.assign_partition(0) + + # Do some state updates and store the processed offset to simulate processing + tx = checkpoint.get_store_transaction("topic", 0) + tx.set(key=key, value=value, prefix=prefix) + checkpoint.store_offset("topic", 0, processed_offset) + + consumer_mock.commit.side_effect = ValueError("Commit failure") + # Checkpoint commit should fail if consumer failed to commit + with pytest.raises(ValueError): + checkpoint.commit() + + # Producer should flush + assert producer_mock.flush.call_count + # The 
transaction should remain prepared, but not completed + assert tx.prepared + assert not tx.completed + + def test_get_store_transaction_store_not_registered_fails(self, checkpoint_factory): + checkpoint = checkpoint_factory() + with pytest.raises(StoreNotRegisteredError): + with checkpoint.get_store_transaction("topic", 0, "default"): + ... + + def test_get_store_transaction_success(self, checkpoint_factory, state_manager): + state_manager.register_store("topic", "default") + store = state_manager.get_store("topic", "default") + store.assign_partition(0) + + checkpoint = checkpoint_factory(state_manager_=state_manager) + tx = checkpoint.get_store_transaction("topic", 0, "default") + assert tx + tx2 = checkpoint.get_store_transaction("topic", 0, "default") + assert tx2 is tx From 8a4ba0383455afec9c36ac42e1fb807671aca473 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Wed, 24 Apr 2024 13:51:15 +0200 Subject: [PATCH 09/28] Expose changelog name and partition on ChangelogProducer --- quixstreams/state/recovery.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/quixstreams/state/recovery.py b/quixstreams/state/recovery.py index daeb6301c..11e2bc2c9 100644 --- a/quixstreams/state/recovery.py +++ b/quixstreams/state/recovery.py @@ -4,7 +4,7 @@ from confluent_kafka import TopicPartition as ConfluentPartition from quixstreams.kafka import Consumer -from quixstreams.models import ConfluentKafkaMessageProto +from quixstreams.models import ConfluentKafkaMessageProto, Topic from quixstreams.models.topics import TopicManager from quixstreams.models.types import MessageHeadersMapping from quixstreams.rowproducer import RowProducer @@ -14,7 +14,12 @@ logger = logging.getLogger(__name__) -__all__ = ("ChangelogProducer", "ChangelogProducerFactory", "RecoveryManager") +__all__ = ( + "ChangelogProducer", + "ChangelogProducerFactory", + "RecoveryManager", + "RecoveryPartition", +) class RecoveryPartition: @@ -52,7 +57,7 @@ def needs_recovery(self): Determine whether recovery is necessary for underlying `StorePartition`. """ has_consumable_offsets = self._changelog_lowwater != self._changelog_highwater - state_is_behind = (self._changelog_highwater - self.offset) > 0 + state_is_behind = self._changelog_highwater > self.offset return has_consumable_offsets and state_is_behind @property @@ -62,7 +67,7 @@ def needs_offset_update(self): Usually checked during assign if recovery was not required. """ - return self._changelog_highwater and (self.offset != self._changelog_highwater) + return self._changelog_highwater and (self._changelog_highwater < self.offset) def update_offset(self): """ @@ -122,7 +127,7 @@ def __init__(self, changelog_name: str, producer: RowProducer): self._changelog_name = changelog_name self._producer = producer - def get_partition_producer(self, partition_num): + def get_partition_producer(self, partition_num) -> "ChangelogProducer": """ Generate a ChangelogProducer for producing to a specific partition number (and thus StorePartition). @@ -140,16 +145,24 @@ class ChangelogProducer: kafka changelog partition. 
""" - def __init__(self, changelog_name: str, partition_num: int, producer: RowProducer): + def __init__(self, changelog_name: str, partition: int, producer: RowProducer): """ :param changelog_name: A changelog topic name - :param partition_num: source topic partition number + :param partition: source topic partition number :param producer: a RowProducer (not shared with `Application` instance) """ self._changelog_name = changelog_name - self._partition_num = partition_num + self._partition_num = partition self._producer = producer + @property + def changelog_name(self) -> str: + return self._changelog_name + + @property + def partition(self) -> int: + return self._partition_num + def produce( self, key: bytes, @@ -211,7 +224,9 @@ def recovering(self) -> bool: """ return self.has_assignments and self._running - def register_changelog(self, topic_name: str, store_name: str, consumer_group: str): + def register_changelog( + self, topic_name: str, store_name: str, consumer_group: str + ) -> Topic: """ Register a changelog Topic with the TopicManager. From a7cffe44c9516ce5c272363c5c03797292b211d9 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Wed, 24 Apr 2024 13:51:43 +0200 Subject: [PATCH 10/28] Add missing recovery_manager_factory --- tests/test_quixstreams/test_state/fixtures.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test_quixstreams/test_state/fixtures.py b/tests/test_quixstreams/test_state/fixtures.py index 9d2302c64..29a2eadda 100644 --- a/tests/test_quixstreams/test_state/fixtures.py +++ b/tests/test_quixstreams/test_state/fixtures.py @@ -1,14 +1,22 @@ -import pytest import uuid - from typing import Optional from unittest.mock import create_autospec +import pytest + from quixstreams.kafka import Consumer from quixstreams.state.recovery import RecoveryPartition, RecoveryManager from quixstreams.state.types import StorePartition +@pytest.fixture() +def recovery_manager_factory(topic_manager_factory): + def factory(consumer: Consumer) -> RecoveryManager: + return RecoveryManager(topic_manager=topic_manager_factory(), consumer=consumer) + + return factory + + @pytest.fixture() def recovery_partition_store_mock(): store = create_autospec(StorePartition)() From 5c98975e478c518df937e44b9a001af1facf1dc5 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Wed, 24 Apr 2024 13:52:11 +0200 Subject: [PATCH 11/28] Add more logs to store transaction --- quixstreams/state/rocksdb/transaction.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py index 5ac6c2195..48901f819 100644 --- a/quixstreams/state/rocksdb/transaction.py +++ b/quixstreams/state/rocksdb/transaction.py @@ -357,6 +357,12 @@ def _flush_state( int_to_int64_bytes(changelog_offset), meta_cf_handle, ) + logger.debug( + f"Flushing state changes to the disk " + f'path="{self._partition.path}" ' + f"processed_offset={processed_offset} " + f"changelog_offset={changelog_offset}" + ) self._partition.write(self._batch) @_validate_transaction_status(PartitionTransactionStatus.STARTED) From d4ed15cf961c08175062f00387ff8cec8665c653 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Wed, 24 Apr 2024 13:53:08 +0200 Subject: [PATCH 12/28] Log elapsed time of the checkpoint commit --- quixstreams/processing_context.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/quixstreams/processing_context.py b/quixstreams/processing_context.py index 84d049f3a..91b3dcc2a 100644 --- a/quixstreams/processing_context.py +++ 
b/quixstreams/processing_context.py @@ -1,5 +1,6 @@ import dataclasses import logging +import time from typing import Optional from quixstreams.checkpointing import Checkpoint @@ -70,6 +71,10 @@ def commit_checkpoint(self, force: bool = False): :param force: if `True`, commit the checkpoint before its expiration deadline. """ if not self._checkpoint.empty() and (self._checkpoint.expired() or force): + logger.info(f"Committing a checkpoint force={force}") + start = time.monotonic() self._checkpoint.commit() + elapsed = round(time.monotonic() - start, 2) + logger.info(f"Committed a checkpoint force={force} time_elapsed={elapsed}s") self.init_checkpoint() From a1ff8906392ae43170b510a2710fb6084b1e728b Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Wed, 24 Apr 2024 13:53:41 +0200 Subject: [PATCH 13/28] Fix recovery test --- tests/test_quixstreams/test_state/test_recovery.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_quixstreams/test_state/test_recovery.py b/tests/test_quixstreams/test_state/test_recovery.py index f701b1f5e..5662dbb1a 100644 --- a/tests/test_quixstreams/test_state/test_recovery.py +++ b/tests/test_quixstreams/test_state/test_recovery.py @@ -2,10 +2,8 @@ import uuid from unittest.mock import patch -from quixstreams.state.recovery import ( - ChangelogProducer, - ConfluentPartition, -) +from quixstreams.state.recovery import ChangelogProducer +from confluent_kafka import TopicPartition as ConfluentPartition from quixstreams.state.recovery import ChangelogProducerFactory from ..utils import ConfluentKafkaMessageStub @@ -26,8 +24,9 @@ def test_needs_recovery_caught_up(self, recovery_partition_store_mock): recovery_partition.store_partition.get_changelog_offset.return_value = 20 assert not recovery_partition_store_mock.needs_recovery - def test_needs_recovery_no_valid_offsets(self, recovery_partition_store_mock): - recovery_partition = recovery_partition_store_mock + def test_needs_recovery_no_valid_offsets(self, recovery_partition_factory): + # Create a RecoveryPartition with the offset ahead of the watermark + recovery_partition = recovery_partition_factory(mocked_changelog_offset=101) recovery_partition.set_watermarks(100, 100) assert not recovery_partition.needs_recovery assert recovery_partition.needs_offset_update From f46d202544e7e789e8316cc8a90b50cb8627f8e4 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Wed, 24 Apr 2024 17:55:18 +0200 Subject: [PATCH 14/28] Don't commit the checkpoint if application fails --- quixstreams/app.py | 48 ++++++++++--------- tests/test_quixstreams/test_app.py | 77 ++++++++++++++++++++++++++---- 2 files changed, 92 insertions(+), 33 deletions(-) diff --git a/quixstreams/app.py b/quixstreams/app.py index 53f4bfe26..5bac65f78 100644 --- a/quixstreams/app.py +++ b/quixstreams/app.py @@ -1,4 +1,5 @@ import contextlib +import enum import functools import logging import os @@ -9,7 +10,6 @@ from confluent_kafka import TopicPartition from typing_extensions import Self -from .checkpoint import Checkpoint from .context import set_message_context, copy_context from .core.stream import Filtered from .dataframe import StreamingDataFrame @@ -54,6 +54,13 @@ MessageProcessedCallback = Callable[[str, int, int], None] +class ApplicationStatus(enum.Enum): + CREATED = 1 + RUNNING = 2 + FAILED = 3 + STOPPED = 4 + + class Application: """ The main Application class. 
@@ -234,7 +241,6 @@ def __init__( self._commit_interval = commit_interval self._producer_extra_config = producer_extra_config self._consumer_extra_config = consumer_extra_config - self._consumer = RowConsumer( broker_address=broker_address, consumer_group=consumer_group, @@ -253,11 +259,10 @@ def __init__( self._consumer_poll_timeout = consumer_poll_timeout self._producer_poll_timeout = producer_poll_timeout - self._running = False self._on_processing_error = on_processing_error or default_on_processing_error self._on_message_processed = on_message_processed self._auto_create_topics = auto_create_topics - self._do_recovery_check = False + self._status: ApplicationStatus = ApplicationStatus.CREATED if not topic_manager: topic_manager = topic_manager_factory( @@ -281,12 +286,6 @@ def __init__( else None ), ) - self._checkpoint = Checkpoint( - producer=self._producer, - consumer=self._consumer, - state_manager=self._state_manager, - commit_interval=self._commit_interval, - ) self._processing_context = ProcessingContext( commit_interval=self._commit_interval, producer=self._producer, @@ -548,7 +547,7 @@ def dataframe( ) return sdf - def stop(self): + def stop(self, fail: bool = False): """ Stop the internal poll loop and the message processing. @@ -557,8 +556,11 @@ def stop(self): To otherwise stop an application, either send a `SIGTERM` to the process (like Kubernetes does) or perform a typical `KeyboardInterrupt` (`Ctrl+C`). + + :param fail: if True, signals that application is stopped due + to unhandled exception and it shouldn't commit the current checkpoint. """ - self._running = False + self._status = ApplicationStatus.FAILED if fail else ApplicationStatus.STOPPED if self._state_manager.using_changelogs: self._state_manager.stop_recovery() @@ -687,14 +689,11 @@ def run( self._setup_topics() exit_stack = contextlib.ExitStack() - exit_stack.enter_context(self._producer) - exit_stack.enter_context(self._consumer) exit_stack.enter_context(self._state_manager) - - exit_stack.callback( - lambda *_: logger.debug("Closing Kafka consumers & producers") + exit_stack.enter_context(self._consumer) + exit_stack.push( + lambda exc_type, exc_val, exc_tb: self.stop(fail=exc_val is not None) ) - exit_stack.callback(lambda *_: self.stop()) with exit_stack: # Subscribe to topics in Kafka and start polling @@ -706,21 +705,20 @@ def run( ) logger.info("Waiting for incoming messages") # Start polling Kafka for messages and callbacks - self._running = True + self._status = ApplicationStatus.RUNNING # Initialize the checkpoint self._processing_context.init_checkpoint() dataframe_composed = dataframe.compose() - while self._running: + while self._status == ApplicationStatus.RUNNING: if self._state_manager.recovery_required: self._state_manager.do_recovery() else: self._process_message(dataframe_composed) self._processing_context.commit_checkpoint() - self._processing_context.commit_checkpoint(force=True) logger.info("Stop processing of StreamingDataFrame") def _quix_runtime_init(self): @@ -836,8 +834,12 @@ def _on_revoke(self, _, topic_partitions: List[TopicPartition]): """ Revoke partitions from consumer and state """ - # Commit everything processed so far - self._processing_context.commit_checkpoint(force=True) + # Commit everything processed so far unless the application is closing + # because of unhandled exception. 
+ # In this case, we should drop the checkpoint and let another consumer + # pick up from the latest one + if not self._status == ApplicationStatus.FAILED: + self._processing_context.commit_checkpoint(force=True) self._consumer.incremental_unassign(topic_partitions) if self._state_manager.stores: diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py index 366c4aab1..4411b7151 100644 --- a/tests/test_quixstreams/test_app.py +++ b/tests/test_quixstreams/test_app.py @@ -93,7 +93,7 @@ def test_produce_and_consume(self, app_factory, topic_factory): for msg in consumed_messages: assert msg in messages_to_produce - def test_run_consume_and_produce( + def test_run_success( self, app_factory, row_consumer_factory, @@ -172,6 +172,62 @@ def on_message_processed(topic_, partition, offset): assert row.key == data["key"] assert row.value == {column_name: loads(data["value"].decode())} + def test_run_fails_no_commit( + self, + app_factory, + row_consumer_factory, + executor, + row_factory, + ): + """ + Test that Application doesn't commit the checkpoint in case of failure + """ + + app = app_factory( + auto_offset_reset="earliest", + commit_interval=9999, # Set a high commit interval to ensure no autocommit + ) + + partition_num = 0 + topic_in = app.topic(str(uuid.uuid4())) + + def count_and_fail(_): + # Count the incoming messages and fail on processing the last one + nonlocal processed_count + + processed_count += 1 + # Stop processing after consuming all the messages + if processed_count == total_messages: + failed.set_result(True) + raise ValueError("test") + + sdf = app.dataframe(topic_in).apply(count_and_fail) + + processed_count = 0 + total_messages = 3 + # Produce messages to the topic and flush + data = {"key": b"key", "value": b'"value"', "partition": partition_num} + with app.get_producer() as producer: + for _ in range(total_messages): + producer.produce(topic_in.name, **data) + + failed = Future() + + # Stop app when the future is resolved + executor.submit(_stop_app_on_future, app, failed, 15.0) + with pytest.raises(ValueError): + app.run(sdf) + + # Check that all messages have been processed + assert processed_count == total_messages + + # Ensure the offset is not committed to Kafka + with row_consumer_factory() as row_consumer: + committed, *_ = row_consumer.committed( + [TopicPartition(topic_in.name, partition_num)] + ) + assert committed.offset == -1001 + def test_run_consumer_error_raised(self, app_factory, executor): # Set "auto_offset_reset" to "error" to simulate errors in Consumer app = app_factory(auto_offset_reset="error") @@ -902,7 +958,7 @@ def count(_, state: State): # All keys in state must be prefixed with the message key assert tx.get("total", prefix=message_key) == total_consumed.result() - def test_run_stateful_processing_fails( + def test_run_stateful_fails_no_commit( self, app_factory, executor, @@ -915,23 +971,24 @@ def test_run_stateful_processing_fails( consumer_group=consumer_group, auto_offset_reset="earliest", state_dir=state_dir, + commit_interval=9999, # Set a high commit interval to ensure no autocommit ) topic_in = app.topic(str(uuid.uuid4()), value_deserializer=JSONDeserializer()) # Define a function that counts incoming Rows using state - def count(_, state: State): + def count_and_fail(_, state: State): total = state.get("total", 0) total += 1 state.set("total", total) + # Fail after processing all messages + if total == total_messages: + failed.set_result(True) + raise ValueError("test") failed = Future() - def fail(*_): - 
failed.set_result(True) - raise ValueError("test") - - sdf = app.dataframe(topic_in).update(count, stateful=True).update(fail) + sdf = app.dataframe(topic_in).update(count_and_fail, stateful=True) total_messages = 3 # Produce messages to the topic and flush @@ -1054,7 +1111,7 @@ def test_on_assign_topic_offset_behind_warning( tx = store.start_partition_transaction(partition_num) # Do some change to probe the Writebatch tx.set("key", "value", prefix=b"__key__") - tx.maybe_flush(offset=9999) + tx.flush(processed_offset=9999) assert state_partitions[partition_num].get_processed_offset() == 9999 # Define some stateful function so the App assigns store partitions @@ -1233,11 +1290,11 @@ def validate_state(): # validate and then delete the state assert processed_count == partition_msg_count validate_state() + app.clear_state() # run the app again and validate the recovered state processed_count = {0: 0, 1: 0} app, sdf, topic = get_app() - app.clear_state() done = Future() executor.submit(_stop_app_on_future, app, done, 10.0) app.run(sdf) From 4a8d2eccce7ed16e5cc8e51ca7c989b0387d494b Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Fri, 26 Apr 2024 13:25:59 +0200 Subject: [PATCH 15/28] Add source topic-partition-offset to changelog messages - Move changelog producing code from Partition to StateTransaction - Make "processed_offset" a required param in "prepare()" - Pass the source topic info to the ChangelogProducer and add it to the changelog messages --- quixstreams/state/manager.py | 14 +- quixstreams/state/recovery.py | 27 ++- quixstreams/state/rocksdb/metadata.py | 1 + quixstreams/state/rocksdb/partition.py | 37 +-- quixstreams/state/rocksdb/transaction.py | 214 ++++++++++-------- .../state/rocksdb/windowed/partition.py | 1 + .../state/rocksdb/windowed/transaction.py | 9 +- quixstreams/state/types.py | 25 +- tests/test_quixstreams/test_app.py | 6 +- .../test_state/test_recovery.py | 19 +- .../test_state/test_rocksdb/fixtures.py | 16 +- .../test_rocksdb/test_transaction.py | 61 ++++- .../test_rocksdb/test_windowed/fixtures.py | 6 +- .../test_windowed/test_transaction.py | 50 +++- 14 files changed, 291 insertions(+), 195 deletions(-) diff --git a/quixstreams/state/manager.py b/quixstreams/state/manager.py index 0db79d54c..457a73291 100644 --- a/quixstreams/state/manager.py +++ b/quixstreams/state/manager.py @@ -122,13 +122,15 @@ def _setup_changelogs( f'State Manager: registering changelog for store "{store_name}" ' f'(topic "{topic_name}")' ) + changelog_topic = self._recovery_manager.register_changelog( + topic_name=topic_name, + store_name=store_name, + consumer_group=self._group_id, + ) return ChangelogProducerFactory( - self._recovery_manager.register_changelog( - topic_name=topic_name, - store_name=store_name, - consumer_group=self._group_id, - ).name, - self._producer, + changelog_name=changelog_topic.name, + source_topic_name=topic_name, + producer=self._producer, ) def register_store( diff --git a/quixstreams/state/recovery.py b/quixstreams/state/recovery.py index 11e2bc2c9..ae7cab875 100644 --- a/quixstreams/state/recovery.py +++ b/quixstreams/state/recovery.py @@ -117,7 +117,9 @@ class ChangelogProducerFactory: Generates ChangelogProducers, which produce changelog messages to a StorePartition. 
""" - def __init__(self, changelog_name: str, producer: RowProducer): + def __init__( + self, changelog_name: str, source_topic_name: str, producer: RowProducer + ): """ :param changelog_name: changelog topic name :param producer: a RowProducer (not shared with `Application` instance) @@ -125,6 +127,7 @@ def __init__(self, changelog_name: str, producer: RowProducer): :return: a ChangelogWriter instance """ self._changelog_name = changelog_name + self._source_topic_name = source_topic_name self._producer = producer def get_partition_producer(self, partition_num) -> "ChangelogProducer": @@ -135,7 +138,10 @@ def get_partition_producer(self, partition_num) -> "ChangelogProducer": :param partition_num: source topic partition number """ return ChangelogProducer( - self._changelog_name, partition_num, producer=self._producer + changelog_name=self._changelog_name, + source_topic_name=self._source_topic_name, + partition=partition_num, + producer=self._producer, ) @@ -145,16 +151,27 @@ class ChangelogProducer: kafka changelog partition. """ - def __init__(self, changelog_name: str, partition: int, producer: RowProducer): + def __init__( + self, + changelog_name: str, + source_topic_name: str, + partition: int, + producer: RowProducer, + ): """ :param changelog_name: A changelog topic name :param partition: source topic partition number :param producer: a RowProducer (not shared with `Application` instance) """ self._changelog_name = changelog_name + self._source_topic_name = source_topic_name self._partition_num = partition self._producer = producer + @property + def source_topic_name(self) -> str: + return self._source_topic_name + @property def changelog_name(self) -> str: return self._changelog_name @@ -184,8 +201,8 @@ def produce( topic=self._changelog_name, ) - def flush(self): - self._producer.flush() + def flush(self, timeout: Optional[float] = None) -> int: + return self._producer.flush(timeout=timeout) class RecoveryManager: diff --git a/quixstreams/state/rocksdb/metadata.py b/quixstreams/state/rocksdb/metadata.py index f7683902b..948469f0d 100644 --- a/quixstreams/state/rocksdb/metadata.py +++ b/quixstreams/state/rocksdb/metadata.py @@ -6,3 +6,4 @@ METADATA_CF_NAME = "__metadata__" CHANGELOG_CF_MESSAGE_HEADER = "__column_family__" +CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER = "__processed_tp_offset__" diff --git a/quixstreams/state/rocksdb/partition.py b/quixstreams/state/rocksdb/partition.py index 8aab50316..5246fbde0 100644 --- a/quixstreams/state/rocksdb/partition.py +++ b/quixstreams/state/rocksdb/partition.py @@ -46,7 +46,6 @@ class RocksDBStorePartition(StorePartition): 1. Managing access to the RocksDB instance 2. Creating transactions to interact with data 3. Flushing WriteBatches to the RocksDB - 4. Producing state-related changelog messages It opens the RocksDB on `__init__`. If the db is locked by another process, it will retry according to `open_max_retries` and `open_retry_backoff` options. @@ -73,25 +72,6 @@ def __init__( self._cf_handle_cache: Dict[str, ColumnFamily] = {} self._changelog_producer = changelog_producer - @property - def using_changelogs(self) -> bool: - return bool(self._changelog_producer) - - @property - def changelog_topic_partition(self) -> Optional[Tuple[str, int]]: - """ - Return the changelog topic-partition for the given StorePartition. - - Returns `None` if changelog_producer is not provided. 
- - :return: (topic, partition) or None - """ - if self._changelog_producer is not None: - return ( - self._changelog_producer.changelog_name, - self._changelog_producer.partition, - ) - def begin( self, ) -> RocksDBPartitionTransaction: @@ -105,6 +85,7 @@ def begin( partition=self, dumps=self._dumps, loads=self._loads, + changelog_producer=self._changelog_producer, ) def _changelog_recover_flush(self, changelog_offset: int, batch: WriteBatch): @@ -152,17 +133,6 @@ def set_changelog_offset(self, changelog_offset: int): """ self._changelog_recover_flush(changelog_offset, WriteBatch(raw_mode=True)) - def produce_to_changelog( - self, - key: bytes, - value: Optional[bytes] = None, - headers: Optional[MessageHeadersMapping] = None, - ): - """ - Produce a message to the StorePartitions respective changelog. - """ - self._changelog_producer.produce(key=key, value=value, headers=headers) - def write(self, batch: WriteBatch): """ Write `WriteBatch` to RocksDB @@ -227,8 +197,6 @@ def close(self): self._cf_handle_cache = {} self._cf_cache = {} self._db.close() - if self._changelog_producer: - self._changelog_producer.flush() logger.debug(f'Closed rocksdb partition on "{self._path}"') @property @@ -327,9 +295,6 @@ def _open_rocksdict(self) -> Rdict: options=options, access_type=AccessType.read_write(), ) - # write_opts = WriteOptions() - # write_opts.disable_wal = True - # rdict.set_write_options(write_opts) # Ensure metadata column family is created without defining it upfront try: rdict.get_column_family(METADATA_CF_NAME) diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py index 48901f819..499795101 100644 --- a/quixstreams/state/rocksdb/transaction.py +++ b/quixstreams/state/rocksdb/transaction.py @@ -3,7 +3,8 @@ from typing import Any, Union, Optional, Dict, NewType, TYPE_CHECKING, Tuple from rocksdict import WriteBatch - +from quixstreams.utils.json import dumps as json_dumps +from quixstreams.state.recovery import ChangelogProducer from quixstreams.state.types import ( DumpsFunc, LoadsFunc, @@ -17,6 +18,7 @@ CHANGELOG_OFFSET_KEY, PREFIX_SEPARATOR, CHANGELOG_CF_MESSAGE_HEADER, + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, ) from .serialization import serialize, deserialize, int_to_int64_bytes from ..state import TransactionState @@ -100,6 +102,7 @@ def __init__( partition: "RocksDBStorePartition", dumps: DumpsFunc, loads: LoadsFunc, + changelog_producer: Optional[ChangelogProducer] = None, ): """ :param partition: instance of `RocksDBStatePartition` to be used for accessing @@ -115,26 +118,7 @@ def __init__( self._dumps = dumps self._loads = loads self._status = PartitionTransactionStatus.STARTED - - def as_state(self, prefix: Any = DEFAULT_PREFIX) -> TransactionState: - """ - Create a one-time use `TransactionState` object with a limited CRUD interface - to be provided to `StreamingDataFrame` operations. - - The `TransactionState` will prefix all the keys with the supplied `prefix` - for all underlying operations. 
-
-        :param prefix: a prefix to be used for all keys
-        :return: an instance of `TransactionState`
-        """
-        return TransactionState(
-            transaction=self,
-            prefix=(
-                prefix
-                if isinstance(prefix, bytes)
-                else serialize(prefix, dumps=self._dumps)
-            ),
-        )
+        self._changelog_producer = changelog_producer
 
     @_validate_transaction_status(PartitionTransactionStatus.STARTED)
     def get(
@@ -247,6 +231,61 @@ def exists(self, key: Any, prefix: bytes, cf_name: str = "default") -> bool:
 
         return self._partition.exists(key_serialized, cf_name=cf_name)
 
+    @_validate_transaction_status(PartitionTransactionStatus.STARTED)
+    def prepare(self, processed_offset: int):
+        """
+        Produce changelog messages to the changelog topic for all changes accumulated
+        in this transaction and prepare the transaction to flush its state to the state
+        store.
+
+        After successful `prepare()`, the transaction status is changed to PREPARED,
+        and it cannot receive updates anymore.
+
+        If changelog is disabled for this application, no updates will be produced
+        to the changelog topic.
+
+        :param processed_offset: the offset of the latest processed message
+        """
+        try:
+            self._produce_changelog(processed_offset=processed_offset)
+            self._status = PartitionTransactionStatus.PREPARED
+        except Exception:
+            self._status = PartitionTransactionStatus.FAILED
+            raise
+
+    @_validate_transaction_status(
+        PartitionTransactionStatus.STARTED, PartitionTransactionStatus.PREPARED
+    )
+    def flush(
+        self,
+        processed_offset: Optional[int] = None,
+        changelog_offset: Optional[int] = None,
+    ):
+        """
+        Flush the recent updates to the database.
+        It writes the WriteBatch to RocksDB and marks itself as finished.
+
+        If writing fails, the transaction is marked as failed and
+        cannot be used anymore.
+
+        >***NOTE:*** If no keys have been modified during the transaction
+        (i.e. no "set" or "delete" have been called at least once), it will
+        not flush ANY data to the database including the offset to optimize
+        I/O.
+
+        :param processed_offset: offset of the last processed message, optional.
+        :param changelog_offset: offset of the last produced changelog message,
+            optional.
+        """
+        try:
+            self._flush_state(
+                processed_offset=processed_offset, changelog_offset=changelog_offset
+            )
+            self._status = PartitionTransactionStatus.COMPLETE
+        except Exception:
+            self._status = PartitionTransactionStatus.FAILED
+            raise
+
     @property
     def status(self) -> PartitionTransactionStatus:
         return self._status
@@ -289,25 +328,70 @@ def failed(self) -> bool:
         """
         return self._status == PartitionTransactionStatus.FAILED
 
+    @property
+    def changelog_topic_partition(self) -> Optional[Tuple[str, int]]:
+        """
+        Return the changelog topic-partition for the StorePartition of this transaction.
+
+        Returns `None` if changelog_producer is not provided.
+
+        :return: (topic, partition) or None
+        """
+        if self._changelog_producer is not None:
+            return (
+                self._changelog_producer.changelog_name,
+                self._changelog_producer.partition,
+            )
+
+    def as_state(self, prefix: Any = DEFAULT_PREFIX) -> TransactionState:
+        """
+        Create a one-time use `TransactionState` object with a limited CRUD interface
+        to be provided to `StreamingDataFrame` operations.
+
+        The `TransactionState` will prefix all the keys with the supplied `prefix`
+        for all underlying operations.
+ + :param prefix: a prefix to be used for all keys + :return: an instance of `TransactionState` + """ + return TransactionState( + transaction=self, + prefix=( + prefix + if isinstance(prefix, bytes) + else serialize(prefix, dumps=self._dumps) + ), + ) + def _produce_changelog(self, processed_offset: Optional[int] = None): - if not self._partition.using_changelogs: + changelog_producer = self._changelog_producer + if changelog_producer is None: return - # TODO: Add topic offset to the changelog headers - changelog_topic, changelog_partition = self._partition.changelog_topic_partition + source_topic, changelog_topic, partition = ( + changelog_producer.source_topic_name, + changelog_producer.changelog_name, + changelog_producer.partition, + ) logger.debug( f"Flushing state changes to the changelog topic " f'topic_name="{changelog_topic}" ' - f"partition={changelog_partition} " + f"partition={partition} " f"processed_offset={processed_offset}" ) # Iterate over the transaction update cache for cf_name, cf_update_cache in self._update_cache.items(): - headers = {CHANGELOG_CF_MESSAGE_HEADER: cf_name} + source_tp_offset_header = json_dumps( + [source_topic, partition, processed_offset] + ) + headers = { + CHANGELOG_CF_MESSAGE_HEADER: cf_name, + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: source_tp_offset_header, + } for _prefix, prefix_update_cache in cf_update_cache.items(): for key, value in prefix_update_cache.items(): # Produce changes to the changelog topic - self._partition.produce_to_changelog( + self._changelog_producer.produce( key=key, value=value if value is not DELETED else None, headers=headers, @@ -365,72 +449,6 @@ def _flush_state( ) self._partition.write(self._batch) - @_validate_transaction_status(PartitionTransactionStatus.STARTED) - def prepare(self, processed_offset: Optional[int] = None): - """ - Produce changelog messages to the changelog topic for all changes accumulated - in this transaction and prepare transcation to flush its state to the state - store. - - After successful `prepare()`, the transaction status is changed to PREPARED, - and it cannot receive updates anymore. - - If changelog is disabled for this application, no updates will be produced - to the changelog topic. - - :param processed_offset: the offset of the latest processed message - """ - try: - self._produce_changelog(processed_offset=processed_offset) - self._status = PartitionTransactionStatus.PREPARED - except Exception: - self._status = PartitionTransactionStatus.FAILED - raise - - @_validate_transaction_status( - PartitionTransactionStatus.STARTED, PartitionTransactionStatus.PREPARED - ) - def flush( - self, - processed_offset: Optional[int] = None, - changelog_offset: Optional[int] = None, - ): - """ - Flush the recent updates to the database. - It writes the WriteBatch to RocksDB and marks itself as finished. - - If writing fails, the transaction is marked as failed and - cannot be used anymore. - - >***NOTE:*** If no keys have been modified during the transaction - (i.e. no "set" or "delete" have been called at least once), it will - not flush ANY data to the database including the offset to optimize - I/O. - - :param processed_offset: offset of the last processed message, optional. - :param changelog_offset: offset of the last produced changelog message, - optional. 
- """ - try: - self._flush_state( - processed_offset=processed_offset, changelog_offset=changelog_offset - ) - self._status = PartitionTransactionStatus.COMPLETE - except Exception: - self._status = PartitionTransactionStatus.FAILED - raise - - @property - def changelog_topic_partition(self) -> Optional[Tuple[str, int]]: - """ - Return the changelog topic-partition for the StorePartition of this transaction. - - Returns `None` if changelog_producer is not provided. - - :return: (topic, partition) or None - """ - return self._partition.changelog_topic_partition - def _serialize_value(self, value: Any) -> bytes: return serialize(value, dumps=self._dumps) @@ -446,7 +464,13 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): - # Note: with state transactions, context managers are meant to be used mostly - # in tests + """ + Note: with state transactions, context manager interface is meant + to be used mostly in unit tests. + + Normally, the Checkpoint class is responsible for managing and flushing + the transactions. + """ + if exc_val is None and not self.failed: self.flush() diff --git a/quixstreams/state/rocksdb/windowed/partition.py b/quixstreams/state/rocksdb/windowed/partition.py index ffa22826e..579292e88 100644 --- a/quixstreams/state/rocksdb/windowed/partition.py +++ b/quixstreams/state/rocksdb/windowed/partition.py @@ -57,6 +57,7 @@ def begin(self) -> "WindowedRocksDBPartitionTransaction": dumps=self._dumps, loads=self._loads, latest_timestamp_ms=self._latest_timestamp_ms, + changelog_producer=self._changelog_producer, ) def set_latest_timestamp(self, timestamp_ms: int): diff --git a/quixstreams/state/rocksdb/windowed/transaction.py b/quixstreams/state/rocksdb/windowed/transaction.py index d392cf71e..445332a86 100644 --- a/quixstreams/state/rocksdb/windowed/transaction.py +++ b/quixstreams/state/rocksdb/windowed/transaction.py @@ -2,6 +2,7 @@ from rocksdict import ReadOptions +from quixstreams.state.recovery import ChangelogProducer from .metadata import LATEST_EXPIRED_WINDOW_TIMESTAMP_KEY, LATEST_EXPIRED_WINDOW_CF_NAME from .serialization import encode_window_key, encode_window_prefix, parse_window_key from .state import WindowedTransactionState @@ -27,8 +28,14 @@ def __init__( dumps: DumpsFunc, loads: LoadsFunc, latest_timestamp_ms: int, + changelog_producer: Optional[ChangelogProducer] = None, ): - super().__init__(partition=partition, dumps=dumps, loads=loads) + super().__init__( + partition=partition, + dumps=dumps, + loads=loads, + changelog_producer=changelog_producer, + ) self._partition = cast("WindowedRocksDBStorePartition", self._partition) self._latest_timestamp_ms = latest_timestamp_ms diff --git a/quixstreams/state/types.py b/quixstreams/state/types.py index 5c63ad3d5..19fde2524 100644 --- a/quixstreams/state/types.py +++ b/quixstreams/state/types.py @@ -107,17 +107,6 @@ def recover_from_changelog_message( """ ... - def produce_to_changelog( - self, - key: bytes, - value: Optional[bytes] = None, - headers: Optional[MessageHeadersMapping] = None, - ): - """ - Produce a message to the StorePartitions respective changelog. - """ - ... - def get_processed_offset(self) -> Optional[int]: """ Get last processed offset for the given partition @@ -270,7 +259,7 @@ def prepared(self) -> bool: """ ... 
- def prepare(self, processed_offset: Optional[int] = None): + def prepare(self, processed_offset: int): """ Produce changelog messages to the changelog topic for all changes accumulated in this transaction and prepare transcation to flush its state to the state @@ -404,7 +393,7 @@ def prepared(self) -> bool: """ ... - def prepare(self, processed_offset: Optional[int] = None): + def prepare(self, processed_offset: int): """ Produce changelog messages to the changelog topic for all changes accumulated in this transaction and prepare transcation to flush its state to the state @@ -496,6 +485,16 @@ def flush( optional. """ + @property + def changelog_topic_partition(self) -> Optional[Tuple[str, int]]: + """ + Return the changelog topic-partition for the StorePartition of this transaction. + + Returns `None` if changelog_producer is not provided. + + :return: (topic, partition) or None + """ + def __enter__(self): ... def __exit__(self, exc_type, exc_val, exc_tb): ... diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py index 4411b7151..639bbced4 100644 --- a/tests/test_quixstreams/test_app.py +++ b/tests/test_quixstreams/test_app.py @@ -21,9 +21,7 @@ JSONSerializer, TopicConfig, ) -from quixstreams.platforms.quix import ( - QuixKafkaConfigsBuilder, -) +from quixstreams.platforms.quix import QuixKafkaConfigsBuilder from quixstreams.platforms.quix.env import QuixEnvironment from quixstreams.rowconsumer import RowConsumer from quixstreams.state import State @@ -198,7 +196,6 @@ def count_and_fail(_): processed_count += 1 # Stop processing after consuming all the messages if processed_count == total_messages: - failed.set_result(True) raise ValueError("test") sdf = app.dataframe(topic_in).apply(count_and_fail) @@ -983,7 +980,6 @@ def count_and_fail(_, state: State): state.set("total", total) # Fail after processing all messages if total == total_messages: - failed.set_result(True) raise ValueError("test") failed = Future() diff --git a/tests/test_quixstreams/test_state/test_recovery.py b/tests/test_quixstreams/test_state/test_recovery.py index 5662dbb1a..dab1d51dc 100644 --- a/tests/test_quixstreams/test_state/test_recovery.py +++ b/tests/test_quixstreams/test_state/test_recovery.py @@ -79,18 +79,20 @@ def test_produce( value_serializer="bytes", config=topic_manager.topic_config(num_partitions=3), ) + source_topic_name = "source-topic" topic_manager.create_topics([changelog]) - writer = ChangelogProducer( + producer = ChangelogProducer( changelog_name=changelog.name, partition=p_num, + source_topic_name=source_topic_name, producer=row_producer_factory(), ) - writer.produce( + producer.produce( **{k: v for k, v in expected.items() if k in ["key", "value"]}, headers={cf_header: cf}, ) - writer._producer.flush(5) + producer.flush() consumer = consumer_factory(auto_offset_reset="earliest") consumer.subscribe([changelog.name]) @@ -103,16 +105,19 @@ def test_produce( class TestChangelogProducerFactory: def test_get_partition_producer(self, row_producer_factory): changelog_name = "changelog__topic" + source_topic_name = "source-topic" producer = row_producer_factory() p_num = 1 changelog_producer = ChangelogProducerFactory( - changelog_name=changelog_name, producer=producer + changelog_name=changelog_name, + producer=producer, + source_topic_name=source_topic_name, ).get_partition_producer(partition_num=p_num) - assert changelog_producer._changelog_name == changelog_name - assert changelog_producer._partition_num == p_num - assert changelog_producer._producer == producer + 
assert changelog_producer.changelog_name == changelog_name + assert changelog_producer.partition == p_num + assert changelog_producer.source_topic_name == source_topic_name class TestRecoveryManager: diff --git a/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py b/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py index 35f4e464f..b2176a560 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/fixtures.py @@ -1,6 +1,6 @@ import uuid from typing import Optional -from unittest.mock import create_autospec, MagicMock +from unittest.mock import MagicMock, PropertyMock import pytest @@ -11,7 +11,7 @@ @pytest.fixture() -def rocksdb_partition_factory(tmp_path): +def rocksdb_partition_factory(tmp_path, changelog_producer_mock): def factory( name: str = "db", options: Optional[RocksDBOptions] = None, @@ -19,13 +19,9 @@ def factory( ) -> RocksDBStorePartition: path = (tmp_path / name).as_posix() _options = options or RocksDBOptions(open_max_retries=0, open_retry_backoff=3.0) - if not changelog_producer: - changelog_producer = create_autospec(ChangelogProducer)( - "topic", "partition", "producer" - ) return RocksDBStorePartition( path, - changelog_producer=changelog_producer, + changelog_producer=changelog_producer or changelog_producer_mock, options=_options, ) @@ -66,4 +62,8 @@ def rocksdb_store(rocksdb_store_factory) -> RocksDBStore: @pytest.fixture() def changelog_producer_mock(): - return MagicMock(spec_set=ChangelogProducer) + producer = MagicMock(spec_set=ChangelogProducer) + type(producer).source_topic_name = PropertyMock(return_value="test-source-topic") + type(producer).changelog_name = PropertyMock(return_value="test-changelog-topic") + type(producer).partition = PropertyMock(return_value=0) + return producer diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py index 01aad679b..d9adcb9c7 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py @@ -14,7 +14,10 @@ RocksDBPartitionTransaction, InvalidChangelogOffset, ) -from quixstreams.state.rocksdb.metadata import CHANGELOG_CF_MESSAGE_HEADER +from quixstreams.state.rocksdb.metadata import ( + CHANGELOG_CF_MESSAGE_HEADER, + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, +) from quixstreams.state.rocksdb.serialization import serialize from quixstreams.utils.json import dumps @@ -284,7 +287,7 @@ def test_update_key_prepared_transaction_fails(self, rocksdb_partition): tx = rocksdb_partition.begin() tx.set(key="key", value="value", prefix=prefix) - tx.prepare() + tx.prepare(processed_offset=1) assert tx.prepared with pytest.raises(StateTransactionError): @@ -445,6 +448,11 @@ def test_set_and_prepare(self, rocksdb_partition_factory, changelog_producer_moc ] cf = "default" prefix = b"__key__" + source_topic_name, source_partition = ( + changelog_producer_mock.source_topic_name, + changelog_producer_mock.partition, + ) + processed_offset = 1 with rocksdb_partition_factory( changelog_producer=changelog_producer_mock @@ -457,15 +465,25 @@ def test_set_and_prepare(self, rocksdb_partition_factory, changelog_producer_moc cf_name=cf, prefix=prefix, ) - tx.prepare() + tx.prepare(processed_offset=processed_offset) assert changelog_producer_mock.produce.call_count == len(data) + for (key, value), call in zip( data, changelog_producer_mock.produce.call_args_list ): assert 
call.kwargs["key"] == tx._serialize_key(key=key, prefix=prefix) assert call.kwargs["value"] == tx._serialize_value(value=value) - assert call.kwargs["headers"] == {CHANGELOG_CF_MESSAGE_HEADER: cf} + assert call.kwargs["headers"] == { + CHANGELOG_CF_MESSAGE_HEADER: cf, + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( + [ + source_topic_name, + source_partition, + processed_offset, + ] + ), + } assert tx.prepared @@ -475,6 +493,12 @@ def test_delete_and_prepare( key, value = "key", "value" cf = "default" prefix = b"__key__" + source_topic_name, source_partition = ( + changelog_producer_mock.source_topic_name, + changelog_producer_mock.partition, + ) + processed_offset = 1 + with rocksdb_partition_factory( changelog_producer=changelog_producer_mock ) as partition: @@ -482,7 +506,7 @@ def test_delete_and_prepare( tx = partition.begin() tx.delete(key=key, cf_name=cf, prefix=prefix) - tx.prepare() + tx.prepare(processed_offset=processed_offset) assert tx.prepared assert changelog_producer_mock.produce.call_count == 1 @@ -492,7 +516,16 @@ def test_delete_and_prepare( key=key, prefix=prefix ) assert delete_changelog.kwargs["value"] is None - assert delete_changelog.kwargs["headers"] == {CHANGELOG_CF_MESSAGE_HEADER: cf} + assert delete_changelog.kwargs["headers"] == { + CHANGELOG_CF_MESSAGE_HEADER: cf, + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( + [ + source_topic_name, + source_partition, + processed_offset, + ] + ), + } def test_set_delete_and_prepare( self, rocksdb_partition_factory, changelog_producer_mock @@ -504,6 +537,11 @@ def test_set_delete_and_prepare( key, value = "key", "value" cf = "default" prefix = b"__key__" + source_topic_name, source_partition = ( + changelog_producer_mock.source_topic_name, + changelog_producer_mock.partition, + ) + processed_offset = 1 with rocksdb_partition_factory( changelog_producer=changelog_producer_mock @@ -512,7 +550,7 @@ def test_set_delete_and_prepare( tx.set(key=key, value=value, cf_name=cf, prefix=prefix) tx.delete(key=key, cf_name=cf, prefix=prefix) - tx.prepare() + tx.prepare(processed_offset=processed_offset) assert tx.prepared assert changelog_producer_mock.produce.call_count == 1 @@ -522,5 +560,12 @@ def test_set_delete_and_prepare( ) assert delete_changelog.kwargs["value"] is None assert delete_changelog.kwargs["headers"] == { - CHANGELOG_CF_MESSAGE_HEADER: cf + CHANGELOG_CF_MESSAGE_HEADER: cf, + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( + [ + source_topic_name, + source_partition, + processed_offset, + ] + ), } diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py index 0fc5618dd..b60884185 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py @@ -50,19 +50,21 @@ def factory( @pytest.fixture() -def windowed_rocksdb_store_factory_changelog(tmp_path): +def windowed_rocksdb_store_factory_changelog(tmp_path, changelog_producer_mock): def factory( topic: Optional[str] = None, changelog: Optional[str] = None, name: str = "default", producer: Optional[RowProducer] = None, ) -> WindowedRocksDBStore: + topic = topic or str(uuid.uuid4()) return WindowedRocksDBStore( - topic=topic or str(uuid.uuid4()), + topic=topic, name=name, base_dir=str(tmp_path), changelog_producer_factory=ChangelogProducerFactory( changelog_name=changelog or str(uuid.uuid4()), + source_topic_name=topic, producer=producer or 
create_autospec(RowProducer)("address"), ), ) diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py index 2b07a0539..83d6ca3a2 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py @@ -1,7 +1,11 @@ import pytest -from quixstreams.state.rocksdb.metadata import CHANGELOG_CF_MESSAGE_HEADER +from quixstreams.state.rocksdb.metadata import ( + CHANGELOG_CF_MESSAGE_HEADER, + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, +) from quixstreams.state.rocksdb.windowed.serialization import encode_window_key +from quixstreams.utils.json import dumps class TestWindowedRocksDBPartitionTransaction: @@ -317,6 +321,11 @@ def test_update_window_and_prepare( start_ms = 0 end_ms = 10 value = 1 + source_topic_name, source_partition = ( + changelog_producer_mock.source_topic_name, + changelog_producer_mock.partition, + ) + processed_offset = 1 with windowed_rocksdb_partition_factory( changelog_producer=changelog_producer_mock @@ -329,7 +338,7 @@ def test_update_window_and_prepare( timestamp_ms=2, prefix=prefix, ) - tx.prepare() + tx.prepare(processed_offset=processed_offset) assert tx.prepared assert changelog_producer_mock.produce.call_count == 1 @@ -340,23 +349,37 @@ def test_update_window_and_prepare( changelog_producer_mock.produce.assert_called_with( key=expected_produced_key, value=expected_produced_value, - headers={CHANGELOG_CF_MESSAGE_HEADER: "default"}, + headers={ + CHANGELOG_CF_MESSAGE_HEADER: "default", + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( + [ + source_topic_name, + source_partition, + processed_offset, + ] + ), + }, ) def test_delete_window_and_prepare( self, windowed_rocksdb_partition_factory, changelog_producer_mock ): + prefix = b"__key__" + start_ms = 0 + end_ms = 10 + source_topic_name, source_partition = ( + changelog_producer_mock.source_topic_name, + changelog_producer_mock.partition, + ) + processed_offset = 1 + with windowed_rocksdb_partition_factory( changelog_producer=changelog_producer_mock ) as store_partition: - prefix = b"__key__" - start_ms = 0 - end_ms = 10 - tx = store_partition.begin() tx.delete_window(start_ms=start_ms, end_ms=end_ms, prefix=prefix) - tx.prepare() + tx.prepare(processed_offset=processed_offset) assert tx.prepared assert changelog_producer_mock.produce.call_count == 1 @@ -366,5 +389,14 @@ def test_delete_window_and_prepare( changelog_producer_mock.produce.assert_called_with( key=expected_produced_key, value=None, - headers={CHANGELOG_CF_MESSAGE_HEADER: "default"}, + headers={ + CHANGELOG_CF_MESSAGE_HEADER: "default", + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( + [ + source_topic_name, + source_partition, + processed_offset, + ] + ), + }, ) From cf6236cbf6fda382099fadf3ea7d35feffbb4e10 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Mon, 29 Apr 2024 19:21:53 +0200 Subject: [PATCH 16/28] Pass latest committed offset to the store partition for recovery, refactor tests --- quixstreams/app.py | 37 +- quixstreams/state/manager.py | 38 +- quixstreams/state/recovery.py | 119 ++-- quixstreams/state/rocksdb/partition.py | 24 +- quixstreams/state/types.py | 3 +- tests/test_quixstreams/fixtures.py | 15 - tests/test_quixstreams/test_app.py | 17 +- .../test_dataframe/test_dataframe.py | 25 +- .../test_quixstreams/test_models/fixtures.py | 2 +- .../test_models/test_topics/test_topics.py | 2 +- 
 tests/test_quixstreams/test_state/fixtures.py | 49 +-
 .../test_state/test_manager.py | 177 +++---
 .../test_state/test_recovery.py | 550 ------------------
 .../test_state/test_recovery/__init__.py | 0
 .../test_recovery/test_changelog_producer.py | 64 ++
 .../test_recovery/test_recovery_manager.py | 375 ++++++++++++
 .../test_recovery/test_recovery_partition.py | 68 +++
 .../test_state/test_rocksdb/test_partition.py | 10 +-
 .../test_windowed/test_partition.py | 10 +-
 tests/test_quixstreams/utils.py | 63 --
 tests/utils.py | 64 +-
 21 files changed, 828 insertions(+), 884 deletions(-)
 delete mode 100644 tests/test_quixstreams/test_state/test_recovery.py
 create mode 100644 tests/test_quixstreams/test_state/test_recovery/__init__.py
 create mode 100644 tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py
 create mode 100644 tests/test_quixstreams/test_state/test_recovery/test_recovery_manager.py
 create mode 100644 tests/test_quixstreams/test_state/test_recovery/test_recovery_partition.py
 delete mode 100644 tests/test_quixstreams/utils.py

diff --git a/quixstreams/app.py b/quixstreams/app.py
index 5bac65f78..ec6b875a3 100644
--- a/quixstreams/app.py
+++ b/quixstreams/app.py
@@ -805,8 +805,14 @@ def _on_assign(self, _, topic_partitions: List[TopicPartition]):
         if self._state_manager.stores:
             logger.debug(f"Rebalancing: assigning state store partitions")
             for tp in topic_partitions:
+                # Get the latest committed offset for the assigned topic partition
+                tp_committed = self._consumer.committed([tp], timeout=30)[0]
                 # Assign store partitions
-                store_partitions = self._state_manager.on_partition_assign(tp)
+                store_partitions = self._state_manager.on_partition_assign(
+                    topic=tp.topic,
+                    partition=tp.partition,
+                    committed_offset=tp_committed.offset,
+                )
 
                 # Check if the latest committed offset >= stored offset
                 # Otherwise, the re-processed messages might use already updated
@@ -819,16 +825,17 @@ def _on_assign(self, _, topic_partitions: List[TopicPartition]):
                     if offset is not None
                 ]
                 min_stored_offset = min(stored_offsets) + 1 if stored_offsets else None
-                if min_stored_offset is not None:
-                    tp_committed = self._consumer.committed([tp], timeout=30)[0]
-                    if min_stored_offset > tp_committed.offset:
-                        logger.warning(
-                            f'Warning: offset "{tp_committed.offset}" '
-                            f"for topic partition "
-                            f'"{tp_committed.topic}[{tp_committed.partition}]" '
-                            f'is behind the stored offset "{min_stored_offset}". '
-                            f"It may lead to distortions in produced data."
-                        )
+                if (
+                    min_stored_offset is not None
+                    and min_stored_offset > tp_committed.offset
+                ):
+                    logger.warning(
+                        f'Warning: offset "{tp_committed.offset}" '
+                        f"for topic partition "
+                        f'"{tp_committed.topic}[{tp_committed.partition}]" '
+                        f'is behind the stored offset "{min_stored_offset}". '
+                        f"It may lead to distortions in produced data."
+ ) def _on_revoke(self, _, topic_partitions: List[TopicPartition]): """ @@ -845,7 +852,9 @@ def _on_revoke(self, _, topic_partitions: List[TopicPartition]): if self._state_manager.stores: logger.debug(f"Rebalancing: revoking state store partitions") for tp in topic_partitions: - self._state_manager.on_partition_revoke(tp) + self._state_manager.on_partition_revoke( + topic=tp.topic, partition=tp.partition + ) def _on_lost(self, _, topic_partitions: List[TopicPartition]): """ @@ -854,7 +863,9 @@ def _on_lost(self, _, topic_partitions: List[TopicPartition]): if self._state_manager.stores: logger.debug(f"Rebalancing: dropping lost state store partitions") for tp in topic_partitions: - self._state_manager.on_partition_lost(tp) + self._state_manager.on_partition_revoke( + topic=tp.topic, partition=tp.partition + ) def _setup_signal_handlers(self): signal.signal(signal.SIGINT, self._on_sigint) diff --git a/quixstreams/state/manager.py b/quixstreams/state/manager.py index 457a73291..23770bc72 100644 --- a/quixstreams/state/manager.py +++ b/quixstreams/state/manager.py @@ -4,7 +4,6 @@ from typing import List, Dict, Optional from quixstreams.rowproducer import RowProducer -from quixstreams.types import TopicPartition from .exceptions import ( StoreNotRegisteredError, PartitionStoreIsUsed, @@ -196,45 +195,44 @@ def clear_stores(self): shutil.rmtree(self._state_dir) - def on_partition_assign(self, tp: TopicPartition) -> List[StorePartition]: + def on_partition_assign( + self, topic: str, partition: int, committed_offset: int + ) -> List[StorePartition]: """ Assign store partitions for each registered store for the given `TopicPartition` and return a list of assigned `StorePartition` objects. - :param tp: `TopicPartition` from Kafka consumer + :param topic: Kafka topic name + :param partition: Kafka topic partition + :param committed_offset: latest committed offset for the partition :return: list of assigned `StorePartition` """ store_partitions = {} - for name, store in self._stores.get(tp.topic, {}).items(): - store_partition = store.assign_partition(tp.partition) + for name, store in self._stores.get(topic, {}).items(): + store_partition = store.assign_partition(partition) store_partitions[name] = store_partition if self._recovery_manager and store_partitions: self._recovery_manager.assign_partition( - tp.topic, tp.partition, store_partitions + topic=topic, + partition=partition, + committed_offset=committed_offset, + store_partitions=store_partitions, ) return list(store_partitions.values()) - def on_partition_revoke(self, tp: TopicPartition): + def on_partition_revoke(self, topic: str, partition: int): """ Revoke store partitions for each registered store for the given `TopicPartition` - :param tp: `TopicPartition` from Kafka consumer + :param topic: Kafka topic name + :param partition: Kafka topic partition """ - if stores := self._stores.get(tp.topic, {}).values(): + if stores := self._stores.get(topic, {}).values(): if self._recovery_manager: - self._recovery_manager.revoke_partition(tp.partition) + self._recovery_manager.revoke_partition(partition_num=partition) for store in stores: - store.revoke_partition(tp.partition) - - def on_partition_lost(self, tp: TopicPartition): - """ - Revoke and close store partitions for each registered store for the given - `TopicPartition` - - :param tp: `TopicPartition` from Kafka consumer - """ - self.on_partition_revoke(tp) + store.revoke_partition(partition=partition) def init(self): """ diff --git a/quixstreams/state/recovery.py 
b/quixstreams/state/recovery.py
index ae7cab875..9e1899224 100644
--- a/quixstreams/state/recovery.py
+++ b/quixstreams/state/recovery.py
@@ -13,7 +13,6 @@
 logger = logging.getLogger(__name__)
 
-
 __all__ = (
     "ChangelogProducer",
     "ChangelogProducerFactory",
@@ -35,12 +34,30 @@ def __init__(
         changelog_name: str,
         partition_num: int,
         store_partition: StorePartition,
+        committed_offset: int,
     ):
-        self.changelog_name = changelog_name
-        self.partition_num = partition_num
-        self.store_partition = store_partition
+        self._changelog_name = changelog_name
+        self._partition_num = partition_num
+        self._store_partition = store_partition
         self._changelog_lowwater: Optional[int] = None
         self._changelog_highwater: Optional[int] = None
+        self._committed_offset = committed_offset
+
+    @property
+    def changelog_name(self) -> str:
+        return self._changelog_name
+
+    @property
+    def partition_num(self) -> int:
+        return self._partition_num
+
+    @property
+    def changelog_highwater(self) -> Optional[int]:
+        return self._changelog_highwater
+
+    @property
+    def changelog_lowwater(self) -> Optional[int]:
+        return self._changelog_lowwater
 
     @property
     def offset(self) -> int:
@@ -49,7 +66,7 @@ def offset(self) -> int:
 
         :return: changelog offset (int)
         """
-        return self.store_partition.get_changelog_offset() or 0
+        return self._store_partition.get_changelog_offset() or 0
 
     @property
     def needs_recovery(self):
@@ -85,7 +102,7 @@ def update_offset(self):
                 f"network issues. State may be inaccurate for any affected keys. "
                 f"The offset will now be set to {self._changelog_highwater}."
             )
-        self.store_partition.set_changelog_offset(
+        self._store_partition.set_changelog_offset(
             changelog_offset=self._changelog_highwater - 1
         )
 
@@ -97,8 +114,8 @@ def recover_from_changelog_message(
 
         :param changelog_message: A confluent kafka message (everything as bytes)
         """
-        self.store_partition.recover_from_changelog_message(
-            changelog_message=changelog_message
+        self._store_partition.recover_from_changelog_message(
+            changelog_message=changelog_message, committed_offset=self._committed_offset
         )
 
     def set_watermarks(self, lowwater: int, highwater: int):
@@ -165,7 +182,7 @@ def __init__(
         """
         self._changelog_name = changelog_name
         self._source_topic_name = source_topic_name
-        self._partition_num = partition
+        self._partition = partition
         self._producer = producer
 
     @property
@@ -178,7 +195,7 @@ def changelog_name(self) -> str:
 
     @property
     def partition(self) -> int:
-        return self._partition_num
+        return self._partition
 
     def produce(
         self,
@@ -197,7 +214,7 @@ def produce(
             key=key,
             value=value,
             headers=headers,
-            partition=self._partition_num,
+            partition=self._partition,
             topic=self._changelog_name,
         )
 
@@ -223,6 +240,14 @@ def __init__(self, consumer: Consumer, topic_manager: TopicManager):
         self._topic_manager = topic_manager
         self._recovery_partitions: Dict[int, Dict[str, RecoveryPartition]] = {}
 
+    @property
+    def partitions(self) -> Dict[int, Dict[str, RecoveryPartition]]:
+        """
+        Returns a mapping of assigned RecoveryPartitions in the following format:
+        {<partition>: {<store_name>: <RecoveryPartition>}}
+        """
+        return self._recovery_partitions
+
     @property
     def has_assignments(self) -> bool:
         """
@@ -284,29 +309,37 @@ def _generate_recovery_partitions(
         topic_name: str,
         partition_num: int,
         store_partitions: Dict[str, StorePartition],
+        committed_offset: int,
     ) -> List[RecoveryPartition]:
-        recovery_partitions = [
-            RecoveryPartition(
-                changelog_name=self._topic_manager.changelog_topics[topic_name][
-                    store_name
-                ].name,
+        partitions = []
+        for store_name, store_partition in store_partitions.items():
+
changelog_topic = self._topic_manager.changelog_topics[topic_name][ + store_name + ] + recovery_partition = RecoveryPartition( + changelog_name=changelog_topic.name, partition_num=partition_num, store_partition=store_partition, + committed_offset=committed_offset, ) - for store_name, store_partition in store_partitions.items() - ] - for rp in recovery_partitions: - rp.set_watermarks( - *self._consumer.get_watermark_offsets( - ConfluentPartition(rp.changelog_name, rp.partition_num), timeout=10 - ) + + lowwater, highwater = self._consumer.get_watermark_offsets( + ConfluentPartition( + topic=recovery_partition.changelog_name, + partition=recovery_partition.partition_num, + ), + timeout=10, ) - return recovery_partitions + recovery_partition.set_watermarks(lowwater=lowwater, highwater=highwater) + + partitions.append(recovery_partition) + return partitions def assign_partition( self, - topic_name: str, - partition_num: int, + topic: str, + partition: int, + committed_offset: int, store_partitions: Dict[str, StorePartition], ): """ @@ -315,32 +348,36 @@ def assign_partition( Pauses active consumer partitions as needed. """ recovery_partitions = self._generate_recovery_partitions( - topic_name=topic_name, - partition_num=partition_num, + topic_name=topic, + partition_num=partition, store_partitions=store_partitions, + committed_offset=committed_offset, ) for rp in recovery_partitions: - c_name, p_num = rp.changelog_name, rp.partition_num + changelog_name, partition = rp.changelog_name, rp.partition_num if rp.needs_recovery: - logger.info(f"Recovery required for {c_name}[{p_num}]") - self._recovery_partitions.setdefault(p_num, {})[c_name] = rp + logger.info(f"Recovery required for {changelog_name}[{partition}]") + self._recovery_partitions.setdefault(partition, {})[changelog_name] = rp self._consumer.incremental_assign( - [ConfluentPartition(c_name, p_num, rp.offset)] + [ConfluentPartition(changelog_name, partition, rp.offset)] ) elif rp.needs_offset_update: # nothing to recover, but offset is off...likely that offset > # highwater due to At Least Once processing behavior. rp.update_offset() - # figure out if any pausing is required - if self.recovering: - # was already recovering, so pause source topic only - self._consumer.pause([ConfluentPartition(topic_name, partition_num)]) - logger.info("Continuing recovery...") - elif self.has_assignments: - # pause ALL partitions while we wait for Application to start recovery - # (all newly assigned partitions are available on `.assignment`). 
- self._consumer.pause(self._consumer.assignment()) + # Figure out if we need to pause any topic partitions + if self._recovery_partitions: + if self._running: + # Some partitions are already recovering, + # pausing only the source topic partition + self._consumer.pause( + [ConfluentPartition(topic=topic, partition=partition)] + ) + else: + # Recovery hasn't started yet, so pause ALL partitions + # and wait for Application to start recovery + self._consumer.pause(self._consumer.assignment()) def _revoke_recovery_partitions( self, diff --git a/quixstreams/state/rocksdb/partition.py b/quixstreams/state/rocksdb/partition.py index 5246fbde0..124ce25a1 100644 --- a/quixstreams/state/rocksdb/partition.py +++ b/quixstreams/state/rocksdb/partition.py @@ -1,12 +1,12 @@ import logging import time -from typing import Any, Union, Optional, List, Dict, Tuple +from typing import Any, Union, Optional, List, Dict -from rocksdict import WriteBatch, Rdict, ColumnFamily, AccessType, WriteOptions +from rocksdict import WriteBatch, Rdict, ColumnFamily, AccessType from quixstreams.models import ConfluentKafkaMessageProto +from quixstreams.utils.json import loads as json_loads from quixstreams.state.recovery import ChangelogProducer -from quixstreams.models.types import MessageHeadersMapping from quixstreams.state.types import ( StorePartition, ) @@ -20,6 +20,7 @@ PROCESSED_OFFSET_KEY, CHANGELOG_OFFSET_KEY, CHANGELOG_CF_MESSAGE_HEADER, + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, ) from .options import RocksDBOptions from .serialization import ( @@ -100,27 +101,30 @@ def _changelog_recover_flush(self, changelog_offset: int, batch: WriteBatch): self.write(batch) def recover_from_changelog_message( - self, changelog_message: ConfluentKafkaMessageProto + self, changelog_message: ConfluentKafkaMessageProto, committed_offset: int ): """ Updates state from a given changelog message. :param changelog_message: A raw Confluent message read from a changelog topic. + :param committed_offset: latest committed offset for the partition """ - try: - cf_handle = self.get_column_family_handle( - changelog_message.headers()[0][1].decode() - ) - except IndexError: + headers = dict(changelog_message.headers() or ()) + # Parse the column family name from message headers + cf_name = headers.get(CHANGELOG_CF_MESSAGE_HEADER, b"").decode() + if not cf_name: raise ColumnFamilyHeaderMissing( - f"Header '{CHANGELOG_CF_MESSAGE_HEADER}' missing from changelog message!" + f"Header '{CHANGELOG_CF_MESSAGE_HEADER}' missing from changelog message" ) + cf_handle = self.get_column_family_handle(cf_name) + batch = WriteBatch(raw_mode=True) key = changelog_message.key() if value := changelog_message.value(): batch.put(key, value, cf_handle) else: batch.delete(key, cf_handle) + self._changelog_recover_flush(changelog_message.offset(), batch) def set_changelog_offset(self, changelog_offset: int): diff --git a/quixstreams/state/types.py b/quixstreams/state/types.py index 19fde2524..37aa3b6db 100644 --- a/quixstreams/state/types.py +++ b/quixstreams/state/types.py @@ -98,12 +98,13 @@ def begin(self) -> "PartitionTransaction": """ def recover_from_changelog_message( - self, changelog_message: ConfluentKafkaMessageProto + self, changelog_message: ConfluentKafkaMessageProto, committed_offset: int ): """ Updates state from a given changelog message. :param changelog_message: A raw Confluent message read from a changelog topic. + :param committed_offset: latest committed offset for the partition """ ... 
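With patches 15 and 16 combined, each changelog message carries the source topic, partition, and processed offset in its headers, and the recovery path now receives the latest committed offset of the source partition. The sketch below is illustrative only and is not part of the patch series: it shows one way those two pieces could be combined when deciding whether to apply a changelog message during recovery. The helper name `should_apply_changelog_message` is hypothetical, and the exact comparison is an assumption; the header names and the JSON payload layout are the ones defined in the diffs above.

from quixstreams.utils.json import loads as json_loads

# Header names as added in quixstreams/state/rocksdb/metadata.py above
CHANGELOG_CF_MESSAGE_HEADER = "__column_family__"
CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER = "__processed_tp_offset__"


def should_apply_changelog_message(changelog_message, committed_offset: int) -> bool:
    """
    Hypothetical helper: decide whether a changelog message should be applied
    during recovery, given the latest committed offset of the source partition.
    """
    # Confluent messages expose headers as a list of (name, bytes) tuples
    headers = dict(changelog_message.headers() or ())
    tp_offset_bytes = headers.get(CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER)
    if tp_offset_bytes is None:
        # Older changelog messages without the header are applied as before
        return True
    # The header value is a JSON-encoded [source_topic, partition, processed_offset]
    _topic, _partition, processed_offset = json_loads(tp_offset_bytes)
    # Apply only changes produced for source offsets that were already committed;
    # the comparison shown here is an assumption, not the library's final logic
    return processed_offset < committed_offset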
diff --git a/tests/test_quixstreams/fixtures.py b/tests/test_quixstreams/fixtures.py index ab66f924a..e3698a241 100644 --- a/tests/test_quixstreams/fixtures.py +++ b/tests/test_quixstreams/fixtures.py @@ -342,21 +342,6 @@ def state_manager(state_manager_factory) -> StateStoreManager: manager.close() -@pytest.fixture() -def state_manager_changelogs( - state_manager_factory, - topic_admin, - recovery_manager_mock_consumer, -) -> StateStoreManager: - manager = state_manager_factory( - producer=create_autospec(RowProducer)("broker"), - recovery_manager=recovery_manager_mock_consumer, - ) - manager.init() - yield manager - manager.close() - - @pytest.fixture() def quix_mock_config_builder_factory(kafka_container): def factory(workspace_id: Optional[str] = None): diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py index 639bbced4..7846117e7 100644 --- a/tests/test_quixstreams/test_app.py +++ b/tests/test_quixstreams/test_app.py @@ -25,7 +25,6 @@ from quixstreams.platforms.quix.env import QuixEnvironment from quixstreams.rowconsumer import RowConsumer from quixstreams.state import State -from tests.utils import TopicPartitionStub def _stop_app_on_future(app: Application, future: Future, timeout: float): @@ -948,7 +947,7 @@ def count(_, state: State): ) state_manager.register_store(topic_in.name, "default") state_manager.on_partition_assign( - TopicPartitionStub(topic=topic_in.name, partition=partition_num) + topic=topic_in.name, partition=partition_num, committed_offset=-1001 ) store = state_manager.get_store(topic=topic_in.name, store_name="default") with store.start_partition_transaction(partition=partition_num) as tx: @@ -1006,7 +1005,7 @@ def count_and_fail(_, state: State): ) state_manager.register_store(topic_in.name, "default") state_manager.on_partition_assign( - TopicPartitionStub(topic=topic_in.name, partition=0) + topic=topic_in.name, partition=0, committed_offset=-1001 ) store = state_manager.get_store(topic=topic_in.name, store_name="default") with store.start_partition_transaction(partition=0) as tx: @@ -1070,7 +1069,7 @@ def fail(_): ) state_manager.register_store(topic_in.name, "default") state_manager.on_partition_assign( - TopicPartitionStub(topic=topic_in.name, partition=partition_num) + topic=topic_in.name, partition=partition_num, committed_offset=-1001 ) store = state_manager.get_store(topic=topic_in.name, store_name="default") with store.start_partition_transaction(partition=partition_num) as tx: @@ -1101,7 +1100,7 @@ def test_on_assign_topic_offset_behind_warning( with state_manager: state_manager.register_store(topic_in.name, "default") state_partitions = state_manager.on_partition_assign( - TopicPartitionStub(topic=topic_in.name, partition=partition_num) + topic=topic_in.name, partition=partition_num, committed_offset=-1001 ) store = state_manager.get_store(topic_in.name, "default") tx = store.start_partition_transaction(partition_num) @@ -1167,7 +1166,7 @@ def test_clear_state( with state_manager: state_manager.register_store(topic_in_name, "default") state_manager.on_partition_assign( - TopicPartitionStub(topic=topic_in_name, partition=0) + topic=topic_in_name, partition=0, committed_offset=-1001 ) store = state_manager.get_store(topic=topic_in_name, store_name="default") with store.start_partition_transaction(partition=0) as tx: @@ -1181,7 +1180,7 @@ def test_clear_state( with state_manager: state_manager.register_store(topic_in_name, "default") state_manager.on_partition_assign( - TopicPartitionStub(topic=topic_in_name, partition=0) + 
topic=topic_in_name, partition=0, committed_offset=-1001 ) store = state_manager.get_store(topic=topic_in_name, store_name="default") with store.start_partition_transaction(partition=0) as tx: @@ -1252,7 +1251,7 @@ def validate_state(): state_manager.register_store(topic.name, store_name) for p_num, count in partition_msg_count.items(): state_manager.on_partition_assign( - TopicPartitionStub(topic=topic.name, partition=p_num) + topic=topic.name, partition=p_num, committed_offset=-1001 ) store = state_manager.get_store( topic=topic.name, store_name=store_name @@ -1390,7 +1389,7 @@ def validate_state(): state_manager.register_windowed_store(topic.name, store_name) for p_num, windows in expected_window_updates.items(): state_manager.on_partition_assign( - TopicPartitionStub(topic=topic.name, partition=p_num) + topic=topic.name, partition=p_num, committed_offset=-1001 ) store = state_manager.get_store( topic=topic.name, store_name=store_name diff --git a/tests/test_quixstreams/test_dataframe/test_dataframe.py b/tests/test_quixstreams/test_dataframe/test_dataframe.py index 729cadd82..8ba57a724 100644 --- a/tests/test_quixstreams/test_dataframe/test_dataframe.py +++ b/tests/test_quixstreams/test_dataframe/test_dataframe.py @@ -8,7 +8,6 @@ from quixstreams.dataframe.exceptions import InvalidOperation from quixstreams.dataframe.windows import WindowResult from quixstreams.models import MessageTimestamp -from tests.utils import TopicPartitionStub class TestStreamingDataFrame: @@ -467,7 +466,7 @@ def stateful_func(value_: dict, state: State) -> int: sdf = sdf.apply(stateful_func, stateful=True) state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) values = [ {"number": 1}, @@ -506,7 +505,7 @@ def stateful_func(value_: dict, state: State): sdf = sdf.update(stateful_func, stateful=True) state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) result = None values = [ @@ -547,7 +546,7 @@ def stateful_func(value_: dict, state: State): sdf = sdf.filter(lambda v, state: state.get("max") >= 3, stateful=True) state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) values = [ {"number": 1}, @@ -590,7 +589,7 @@ def stateful_func(value_: dict, state: State): sdf = sdf[sdf.apply(lambda v, state: state.get("max") >= 3, stateful=True)] state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) values = [ {"number": 1}, @@ -670,7 +669,7 @@ def test_tumbling_window_current( ) state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) messages = [ # Message early in the window @@ -710,7 +709,7 @@ def test_tumbling_window_current_out_of_order_late( sdf = sdf.tumbling_window(duration_ms=10, grace_ms=0).sum().current() state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) messages = [ # Create window [0, 10) @@ -748,7 +747,7 @@ def test_tumbling_window_final( sdf = sdf.tumbling_window(duration_ms=10, grace_ms=0).sum().final() state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) messages 
= [ # Create window [0, 10) @@ -786,7 +785,7 @@ def test_tumbling_window_none_key_messages( sdf = sdf.tumbling_window(duration_ms=10).sum().current() state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) messages = [ # Create window [0,10) @@ -890,7 +889,7 @@ def test_hopping_window_current( sdf = sdf.hopping_window(duration_ms=10, step_ms=5).sum().current() state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) messages = [ # Create window [0,10) @@ -936,7 +935,7 @@ def test_hopping_window_current_out_of_order_late( sdf = sdf.hopping_window(duration_ms=10, step_ms=5).sum().current() state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) messages = [ # Create window [0,10) @@ -976,7 +975,7 @@ def test_hopping_window_final( sdf = sdf.hopping_window(duration_ms=10, step_ms=5).sum().final() state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) messages = [ # Create window [0,10) @@ -1018,7 +1017,7 @@ def test_hopping_window_none_key_messages( sdf = sdf.hopping_window(duration_ms=10, step_ms=5).sum().current() state_manager.on_partition_assign( - tp=TopicPartitionStub(topic=topic.name, partition=0) + topic=topic.name, partition=0, committed_offset=-1001 ) messages = [ # Create window [0,10) diff --git a/tests/test_quixstreams/test_models/fixtures.py b/tests/test_quixstreams/test_models/fixtures.py index 332849ff0..f0e7c7f30 100644 --- a/tests/test_quixstreams/test_models/fixtures.py +++ b/tests/test_quixstreams/test_models/fixtures.py @@ -5,7 +5,7 @@ import pytest -from ..utils import ConfluentKafkaMessageStub +from tests.utils import ConfluentKafkaMessageStub @pytest.fixture() diff --git a/tests/test_quixstreams/test_models/test_topics/test_topics.py b/tests/test_quixstreams/test_models/test_topics/test_topics.py index 3ec4e8121..aa63e37ca 100644 --- a/tests/test_quixstreams/test_models/test_topics/test_topics.py +++ b/tests/test_quixstreams/test_models/test_topics/test_topics.py @@ -26,8 +26,8 @@ SERIALIZERS, DESERIALIZERS, ) +from tests.utils import ConfluentKafkaMessageStub from ..utils import int_to_bytes, float_to_bytes -from ...utils import ConfluentKafkaMessageStub class JSONListDeserializer(JSONDeserializer): diff --git a/tests/test_quixstreams/test_state/fixtures.py b/tests/test_quixstreams/test_state/fixtures.py index 29a2eadda..20184a187 100644 --- a/tests/test_quixstreams/test_state/fixtures.py +++ b/tests/test_quixstreams/test_state/fixtures.py @@ -1,68 +1,47 @@ import uuid from typing import Optional -from unittest.mock import create_autospec +from unittest.mock import MagicMock import pytest from quixstreams.kafka import Consumer +from quixstreams.models import TopicManager from quixstreams.state.recovery import RecoveryPartition, RecoveryManager from quixstreams.state.types import StorePartition @pytest.fixture() def recovery_manager_factory(topic_manager_factory): - def factory(consumer: Consumer) -> RecoveryManager: - return RecoveryManager(topic_manager=topic_manager_factory(), consumer=consumer) + def factory( + topic_manager: Optional[TopicManager] = None, + consumer: Optional[Consumer] = None, + ) -> RecoveryManager: + topic_manager = topic_manager or topic_manager_factory() + consumer = consumer or 
MagicMock(Consumer) + return RecoveryManager(topic_manager=topic_manager, consumer=consumer) return factory -@pytest.fixture() -def recovery_partition_store_mock(): - store = create_autospec(StorePartition)() - store.get_changelog_offset.return_value = 15 - recovery_partition = RecoveryPartition( - changelog_name=f"changelog__{str(uuid.uuid4())}", - partition_num=0, - store_partition=store, - ) - recovery_partition._changelog_lowwater = 10 - recovery_partition._changelog_highwater = 20 - return recovery_partition - - @pytest.fixture() def recovery_partition_factory(): """Mocks a StorePartition if none provided""" def factory( - changelog_name: str = str(uuid.uuid4()), + changelog_name: str = "", partition_num: int = 0, - mocked_changelog_offset: Optional[int] = 15, - lowwater: Optional[int] = None, - highwater: Optional[int] = None, store_partition: Optional[StorePartition] = None, + committed_offset: int = -1001, ): + changelog_name = changelog_name or f"changelog__{str(uuid.uuid4())}" if not store_partition: - store_partition = create_autospec(StorePartition)() - store_partition.get_changelog_offset.return_value = mocked_changelog_offset + store_partition = MagicMock(spec_set=StorePartition) recovery_partition = RecoveryPartition( changelog_name=changelog_name, partition_num=partition_num, store_partition=store_partition, + committed_offset=committed_offset, ) - if lowwater: - recovery_partition._changelog_lowwater = lowwater - if highwater: - recovery_partition._changelog_highwater = highwater return recovery_partition return factory - - -@pytest.fixture() -def recovery_manager_mock_consumer(topic_manager_factory): - return RecoveryManager( - consumer=create_autospec(Consumer)("broker", "group", "latest"), - topic_manager=topic_manager_factory(), - ) diff --git a/tests/test_quixstreams/test_state/test_manager.py b/tests/test_quixstreams/test_state/test_manager.py index 1fd19b560..1043171f3 100644 --- a/tests/test_quixstreams/test_state/test_manager.py +++ b/tests/test_quixstreams/test_state/test_manager.py @@ -1,15 +1,15 @@ import os import uuid -from unittest.mock import patch, call +from unittest.mock import MagicMock import pytest +from quixstreams.kafka import Consumer from quixstreams.state.exceptions import ( StoreNotRegisteredError, PartitionStoreIsUsed, WindowedStoreAlreadyRegisteredError, ) -from quixstreams.state.recovery import ChangelogProducerFactory from tests.utils import TopicPartitionStub @@ -40,11 +40,11 @@ def test_init_state_dir_exists_not_a_dir_fails( ... 
def test_rebalance_partitions_stores_not_registered(self, state_manager): - tp = TopicPartitionStub("topic", 0) # It's ok to rebalance partitions when there are no stores registered - state_manager.on_partition_assign(tp) - state_manager.on_partition_revoke(tp) - state_manager.on_partition_lost(tp) + state_manager.on_partition_assign( + topic="topic", partition=0, committed_offset=-1001 + ) + state_manager.on_partition_revoke(topic="topic", partition=0) def test_register_store(self, state_manager): state_manager = state_manager @@ -66,7 +66,11 @@ def test_assign_revoke_partitions_stores_registered(self, state_manager): store_partitions = [] for tp in partitions: - store_partitions.extend(state_manager.on_partition_assign(tp)) + store_partitions.extend( + state_manager.on_partition_assign( + topic=tp.topic, partition=tp.partition, committed_offset=-1001 + ) + ) assert len(store_partitions) == 3 assert len(state_manager.get_store("topic1", "store1").partitions) == 1 @@ -74,33 +78,7 @@ def test_assign_revoke_partitions_stores_registered(self, state_manager): assert len(state_manager.get_store("topic2", "store1").partitions) == 1 for tp in partitions: - state_manager.on_partition_revoke(tp) - - assert not state_manager.get_store("topic1", "store1").partitions - assert not state_manager.get_store("topic1", "store2").partitions - assert not state_manager.get_store("topic2", "store1").partitions - - def test_assign_lose_partitions_stores_registered(self, state_manager): - state_manager.register_store("topic1", store_name="store1") - state_manager.register_store("topic1", store_name="store2") - state_manager.register_store("topic2", store_name="store1") - - stores_list = [s for d in state_manager.stores.values() for s in d.values()] - assert len(stores_list) == 3 - - partitions = [ - TopicPartitionStub("topic1", 0), - TopicPartitionStub("topic2", 0), - ] - - for tp in partitions: - state_manager.on_partition_assign(tp) - assert len(state_manager.get_store("topic1", "store1").partitions) == 1 - assert len(state_manager.get_store("topic1", "store2").partitions) == 1 - assert len(state_manager.get_store("topic2", "store1").partitions) == 1 - - for tp in partitions: - state_manager.on_partition_lost(tp) + state_manager.on_partition_revoke(topic=tp.topic, partition=tp.partition) assert not state_manager.get_store("topic1", "store1").partitions assert not state_manager.get_store("topic1", "store2").partitions @@ -138,7 +116,9 @@ def test_clear_stores(self, state_manager): # Assign partitions for tp in partitions: - state_manager.on_partition_assign(tp) + state_manager.on_partition_assign( + topic=tp.topic, partition=tp.partition, committed_offset=-1001 + ) # Collect paths of stores to be deleted stores_to_delete = [ @@ -150,7 +130,7 @@ def test_clear_stores(self, state_manager): # Revoke partitions for tp in partitions: - state_manager.on_partition_revoke(tp) + state_manager.on_partition_revoke(topic=tp.topic, partition=tp.partition) # Act - Delete stores state_manager.clear_stores() @@ -163,89 +143,76 @@ def test_clear_stores_fails(self, state_manager): # Register stores state_manager.register_store("topic1", store_name="store1") - # Define the partition - partition = TopicPartitionStub("topic1", 0) - # Assign the partition - state_manager.on_partition_assign(partition) + state_manager.on_partition_assign( + topic="topic1", partition=0, committed_offset=-1001 + ) # Act - Delete stores with pytest.raises(PartitionStoreIsUsed): state_manager.clear_stores() -class TestStateStoreManagerChangelog: - def 
test_rebalance_partitions_stores_not_registered(self, state_manager_changelogs): - state_manager = state_manager_changelogs - tp = TopicPartitionStub("topic", 0) +class TestStateStoreManagerWithRecovery: + def test_rebalance_partitions_stores_not_registered( + self, state_manager_factory, recovery_manager_factory + ): + state_manager = state_manager_factory( + recovery_manager=recovery_manager_factory() + ) # It's ok to rebalance partitions when there are no stores registered - state_manager.on_partition_assign(tp) - state_manager.on_partition_revoke(tp) - state_manager.on_partition_lost(tp) + state_manager.on_partition_assign( + topic="topic", partition=0, committed_offset=-1001 + ) + state_manager.on_partition_revoke(topic="topic", partition=0) + + def test_register_store( + self, state_manager_factory, recovery_manager_factory, topic_manager_factory + ): + topic_manager = topic_manager_factory() + recovery_manager = recovery_manager_factory(topic_manager=topic_manager) + state_manager = state_manager_factory(recovery_manager=recovery_manager) - def test_register_store(self, state_manager_changelogs): - state_manager = state_manager_changelogs - topic_manager = state_manager._recovery_manager._topic_manager + # Create a topic topic = topic_manager.topic(name="topic1") + + # Register a store store_name = "default" state_manager.register_store(topic.name, store_name=store_name) - assert store_name in state_manager._stores[topic.name] + # Check that the store is registered + assert store_name in state_manager.stores[topic.name] + # Check that changelog topic is created assert store_name in topic_manager.changelog_topics[topic.name] def test_assign_revoke_partitions_stores_registered( - self, - state_manager_changelogs, + self, state_manager_factory, recovery_manager_factory, topic_manager_factory ): - state_manager = state_manager_changelogs - recovery_manager = state_manager._recovery_manager - topic_manager = recovery_manager._topic_manager - - changelog_assign = patch.object(recovery_manager, "assign_partition").start() - changelog_revoke = patch.object(recovery_manager, "revoke_partition").start() - topic_manager.topic(name="topic1") - topic_manager.topic(name="topic2") - state_manager.register_store("topic1", store_name="store1") - state_manager.register_store("topic1", store_name="store2") - state_manager.register_store("topic2", store_name="store1") - - stores_list = [s for d in state_manager.stores.values() for s in d.values()] - assert len(stores_list) == 3 - - partitions = [ - TopicPartitionStub("topic1", 0), - TopicPartitionStub("topic2", 0), - ] - - store_partitions = [] - assign_calls = [] - for tp in partitions: - store_partitions.extend(state_manager.on_partition_assign(tp)) - assign_calls.append( - call( - tp.topic, - tp.partition, - { - name: store.partitions[tp.partition] - for name, store in state_manager._stores[tp.topic].items() - }, - ) - ) - assert changelog_assign.call_count == len(assign_calls) - assert len(store_partitions) == 3 - - for store in stores_list: - assert len(store.partitions) == 1 - assert isinstance( - store._changelog_producer_factory, ChangelogProducerFactory - ) - - revoke_calls = [] - for tp in partitions: - state_manager.on_partition_revoke(tp) - revoke_calls.append(call(tp.partition)) - changelog_revoke.assert_has_calls(revoke_calls) - assert changelog_revoke.call_count == len(revoke_calls) - - for store in stores_list: - assert not store.partitions + topic_manager = topic_manager_factory() + consumer = MagicMock(spec_set=Consumer) + 
consumer.get_watermark_offsets.return_value = (0, 10) + recovery_manager = recovery_manager_factory( + topic_manager=topic_manager, consumer=consumer + ) + state_manager = state_manager_factory(recovery_manager=recovery_manager) + topic_name = "topic1" + partition = 0 + topic_manager.topic(name=topic_name) + store_name = "store1" + + # Register a store + state_manager.register_store(topic_name, store_name=store_name) + + # Assign a topic partition + state_manager.on_partition_assign( + topic=topic_name, partition=partition, committed_offset=-1001 + ) + + # Check that RecoveryManager has a partition assigned + assert recovery_manager.partitions + + # Revoke a topic partition + state_manager.on_partition_revoke(topic=topic_name, partition=partition) + + # Check that RecoveryManager has a partition revoked too + assert not recovery_manager.partitions diff --git a/tests/test_quixstreams/test_state/test_recovery.py b/tests/test_quixstreams/test_state/test_recovery.py deleted file mode 100644 index dab1d51dc..000000000 --- a/tests/test_quixstreams/test_state/test_recovery.py +++ /dev/null @@ -1,550 +0,0 @@ -import logging -import uuid -from unittest.mock import patch - -from quixstreams.state.recovery import ChangelogProducer -from confluent_kafka import TopicPartition as ConfluentPartition -from quixstreams.state.recovery import ChangelogProducerFactory -from ..utils import ConfluentKafkaMessageStub - - -class TestRecoveryPartition: - def test_set_watermarks(self, recovery_partition_store_mock): - recovery_partition = recovery_partition_store_mock - recovery_partition.set_watermarks(50, 100) - assert recovery_partition._changelog_lowwater == 50 - assert recovery_partition._changelog_highwater == 100 - - def test_needs_recovery(self, recovery_partition_store_mock): - recovery_partition = recovery_partition_store_mock - assert recovery_partition.needs_recovery - - def test_needs_recovery_caught_up(self, recovery_partition_store_mock): - recovery_partition = recovery_partition_store_mock - recovery_partition.store_partition.get_changelog_offset.return_value = 20 - assert not recovery_partition_store_mock.needs_recovery - - def test_needs_recovery_no_valid_offsets(self, recovery_partition_factory): - # Create a RecoveryPartition with the offset ahead of the watermark - recovery_partition = recovery_partition_factory(mocked_changelog_offset=101) - recovery_partition.set_watermarks(100, 100) - assert not recovery_partition.needs_recovery - assert recovery_partition.needs_offset_update - - def test_recover(self, recovery_partition_store_mock): - recovery_partition = recovery_partition_store_mock - msg = ConfluentKafkaMessageStub() - recovery_partition.recover_from_changelog_message(msg) - recovery_partition.store_partition.recover_from_changelog_message.assert_called_with( - changelog_message=msg - ) - - def test_update_offset(self, recovery_partition_store_mock): - recovery_partition = recovery_partition_store_mock - recovery_partition.update_offset() - recovery_partition.store_partition.set_changelog_offset.assert_called_with( - recovery_partition._changelog_highwater - 1 - ) - - def test_update_offset_warn(self, recovery_partition_store_mock, caplog): - """ - A warning is thrown if the stored changelog offset is higher than the highwater - """ - recovery_partition = recovery_partition_store_mock - recovery_partition.store_partition.get_changelog_offset.return_value = ( - recovery_partition._changelog_highwater + 1 - ) - with caplog.at_level(level=logging.WARNING): - 
recovery_partition.update_offset() - assert caplog.text != "" - - -class TestChangelogProducer: - def test_produce( - self, topic_manager_factory, row_producer_factory, consumer_factory - ): - p_num = 2 - cf_header = "my_cf_header" - cf = "my_cf" - expected = { - "key": b"my_key", - "value": b"10", - "headers": [(cf_header, cf.encode())], - "partition": p_num, - } - topic_manager = topic_manager_factory() - changelog = topic_manager.topic( - name=str(uuid.uuid4()), - key_serializer="bytes", - value_serializer="bytes", - config=topic_manager.topic_config(num_partitions=3), - ) - source_topic_name = "source-topic" - topic_manager.create_topics([changelog]) - - producer = ChangelogProducer( - changelog_name=changelog.name, - partition=p_num, - source_topic_name=source_topic_name, - producer=row_producer_factory(), - ) - producer.produce( - **{k: v for k, v in expected.items() if k in ["key", "value"]}, - headers={cf_header: cf}, - ) - producer.flush() - - consumer = consumer_factory(auto_offset_reset="earliest") - consumer.subscribe([changelog.name]) - message = consumer.poll(10) - - for k in expected: - assert getattr(message, k)() == expected[k] - - -class TestChangelogProducerFactory: - def test_get_partition_producer(self, row_producer_factory): - changelog_name = "changelog__topic" - source_topic_name = "source-topic" - producer = row_producer_factory() - - p_num = 1 - - changelog_producer = ChangelogProducerFactory( - changelog_name=changelog_name, - producer=producer, - source_topic_name=source_topic_name, - ).get_partition_producer(partition_num=p_num) - assert changelog_producer.changelog_name == changelog_name - assert changelog_producer.partition == p_num - assert changelog_producer.source_topic_name == source_topic_name - - -class TestRecoveryManager: - def test_register_changelog(self, recovery_manager_mock_consumer): - recovery_manager = recovery_manager_mock_consumer - topic_manager = recovery_manager._topic_manager - store_name = "my_store" - kwargs = dict( - topic_name="my_topic_name", - consumer_group="my_group", - ) - with patch.object(topic_manager, "changelog_topic") as make_changelog: - recovery_manager.register_changelog(**kwargs, store_name=store_name) - make_changelog.assert_called_with(**kwargs, store_name=store_name) - - def test_assign_partition(self, state_manager_changelogs): - """ - From two `Store`s `StorePartition`s (partition 1), assign the partition - ("window") that needs recovery. - - No recovery underway yet, so should pause all partitions. 
- """ - state_manager = state_manager_changelogs - recovery_manager = state_manager._recovery_manager - topic_manager = recovery_manager._topic_manager - consumer = recovery_manager._consumer - expected_store_name = "window" - expected_offset = 15 - - topic_name = "topic_name" - topic_manager.topic(topic_name) - partition_num = 1 - consumer.get_watermark_offsets.side_effect = [(0, 10), (0, 20)] - consumer.assignment.return_value = "assignments" - - # setup state_managers `StorePartitions` (which also sets up changelog topics) - store_partitions = {} - for store_name, offset in [ - ("default", 10), - (expected_store_name, expected_offset), - ]: - state_manager.register_store(topic_name=topic_name, store_name=store_name) - partition = state_manager.get_store( - topic=topic_name, store_name=store_name - ).assign_partition(partition_num) - patch.object(partition, "get_changelog_offset", return_value=offset).start() - store_partitions[store_name] = partition - - recovery_manager.assign_partition( - topic_name=topic_name, - partition_num=partition_num, - store_partitions=store_partitions, - ) - - # expected changelog topic's partition was subscribed to - expected_changelog_name = topic_manager.changelog_topics[topic_name][ - expected_store_name - ].name - assign_calls = consumer.incremental_assign.call_args_list[0].args - assert len(assign_calls) == 1 - partition_list = assign_calls[0] - assert isinstance(partition_list, list) - assert len(assign_calls) == 1 - confluent_partition = partition_list[0] - assert isinstance(confluent_partition, ConfluentPartition) - assert expected_changelog_name == confluent_partition.topic - assert partition_num == confluent_partition.partition - assert expected_offset == confluent_partition.offset - - # recovery manager should also store respective RecoveryPartition - assert recovery_manager._recovery_partitions[partition_num][ - expected_changelog_name - ] - assert len(recovery_manager._recovery_partitions[partition_num]) == 1 - - # should pause ALL partitions - consumer.pause.assert_called_with("assignments") - - def test_assign_partition_fix_offset_only( - self, recovery_manager_mock_consumer, recovery_partition_factory - ): - """ - From two RecoveryPartitions, fix the one ("window") that has a bad offset. - - No recovery was previously going, and an offset fix will not trigger one. 
- """ - recovery_manager = recovery_manager_mock_consumer - topic_name = "topic_name" - partition_num = 1 - store_names = ["default", "window"] - changelog_names = [f"changelog__{topic_name}__{store}" for store in store_names] - watermarks = [(0, 10), (0, 20)] - changelog_offsets = [10, 22] - - consumer = recovery_manager._consumer - consumer.assignment.return_value = "assignments" - - recovery_partitions = [ - recovery_partition_factory( - changelog_name=changelog_names[i], - partition_num=partition_num, - mocked_changelog_offset=changelog_offsets[i], - lowwater=watermarks[i][0], - highwater=watermarks[i][1], - ) - for i in range(len(store_names)) - ] - patch.object( - recovery_manager, - "_generate_recovery_partitions", - return_value=recovery_partitions, - ).start() - with patch.object(recovery_partitions[1], "update_offset") as update_offset: - recovery_manager.assign_partition( - topic_name=topic_name, - partition_num=partition_num, - store_partitions="mocked_out", - ) - - # no pause or assignments should be called - consumer.pause.assert_not_called() - consumer.incremental_assign.assert_not_called() - update_offset.assert_called() - - def test_assign_partition_fix_offset_during_recovery( - self, recovery_manager_mock_consumer, recovery_partition_factory - ): - """ - From two RecoveryPartitions, fix the one ("window") that has a bad offset. - - Recovery was previously going, so must pause the source topic. - """ - recovery_manager = recovery_manager_mock_consumer - recovery_manager._running = True - topic_name = "topic_name" - partition_num = 1 - store_names = ["default", "window"] - changelog_names = [f"changelog__{topic_name}__{store}" for store in store_names] - watermarks = [(0, 10), (0, 20)] - changelog_offsets = [10, 22] - - consumer = recovery_manager._consumer - - # already in the middle of recovering - recovery_manager._recovery_partitions.setdefault(2, {})[ - changelog_offsets[0] - ] = recovery_partition_factory( - changelog_name=changelog_names[0], - partition_num=2, - ) - assert recovery_manager.recovering - - recovery_partitions = [ - recovery_partition_factory( - changelog_name=changelog_names[i], - partition_num=partition_num, - mocked_changelog_offset=changelog_offsets[i], - lowwater=watermarks[i][0], - highwater=watermarks[i][1], - ) - for i in range(len(store_names)) - ] - - patch.object( - recovery_manager, - "_generate_recovery_partitions", - return_value=recovery_partitions, - ).start() - - with patch.object(recovery_partitions[1], "update_offset") as update_offset: - recovery_manager.assign_partition( - topic_name=topic_name, - partition_num=partition_num, - store_partitions="mocked", - ) - - pause_call = consumer.pause.call_args_list[0].args - assert len(pause_call) == 1 - assert isinstance(pause_call[0], list) - assert len(pause_call[0]) == 1 - assert isinstance(pause_call[0][0], ConfluentPartition) - assert topic_name == pause_call[0][0].topic - assert partition_num == pause_call[0][0].partition - - consumer.incremental_assign.assert_not_called() - update_offset.assert_called() - - def test_assign_partitions_during_recovery( - self, recovery_manager_mock_consumer, recovery_partition_factory - ): - """ - From two RecoveryPartitions, assign the one ("window") that needs recovery. - - RecoveryManager is currently recovering, so should only pause source topic. 
- """ - recovery_manager = recovery_manager_mock_consumer - recovery_manager._running = True - topic_name = "topic_name" - partition_num = 1 - store_names = ["default", "window"] - changelog_names = [f"changelog__{topic_name}__{store}" for store in store_names] - watermarks = [(0, 10), (0, 20)] - changelog_offsets = [10, 15] - - consumer = recovery_manager._consumer - consumer.get_watermark_offsets.side_effect = watermarks - - # already in the middle of recovering - recovery_manager._recovery_partitions.setdefault(2, {})[ - changelog_offsets[0] - ] = recovery_partition_factory( - changelog_name=changelog_names[0], - partition_num=2, - ) - assert recovery_manager.recovering - - recovery_partitions = [ - recovery_partition_factory( - changelog_name=changelog_names[i], - partition_num=partition_num, - mocked_changelog_offset=changelog_offsets[i], - lowwater=watermarks[i][0], - highwater=watermarks[i][1], - ) - for i in range(len(store_names)) - ] - skip_recover_partition = recovery_partitions[0] - should_recover_partition = recovery_partitions[1] - - patch.object( - recovery_manager, - "_generate_recovery_partitions", - return_value=recovery_partitions, - ).start() - recovery_manager.assign_partition( - topic_name=topic_name, - partition_num=partition_num, - store_partitions="mocked_out", - ) - - # should only pause the source topic partition since currently recovering - pause_call = consumer.pause.call_args_list[0].args - assert len(pause_call) == 1 - assert isinstance(pause_call[0], list) - assert len(pause_call[0]) == 1 - assert isinstance(pause_call[0][0], ConfluentPartition) - assert topic_name == pause_call[0][0].topic - assert partition_num == pause_call[0][0].partition - - # should only assign the partition that needs recovery - assign_call = consumer.incremental_assign.call_args_list[0].args - assert len(assign_call) == 1 - assert isinstance(assign_call[0], list) - assert len(assign_call[0]) == 1 - assert isinstance(assign_call[0][0], ConfluentPartition) - assert should_recover_partition.changelog_name == assign_call[0][0].topic - assert should_recover_partition.partition_num == assign_call[0][0].partition - assert should_recover_partition.offset == assign_call[0][0].offset - assert ( - recovery_manager._recovery_partitions[partition_num][ - should_recover_partition.changelog_name - ] - == should_recover_partition - ) - assert ( - skip_recover_partition.changelog_name - not in recovery_manager._recovery_partitions[partition_num] - ) - - def test__revoke_recovery_partition( - self, recovery_manager_mock_consumer, recovery_partition_factory - ): - recovery_manager = recovery_manager_mock_consumer - consumer = recovery_manager._consumer - topic_name = "topic_name" - partition_num = 1 - changelog_names = [ - f"changelog__{topic_name}__{store_name}" - for store_name in ["default", "window"] - ] - - recovery_manager._recovery_partitions = { - partition_num: { - changelog_name: recovery_partition_factory( - changelog_name=changelog_name, - partition_num=partition_num, - ) - for changelog_name in changelog_names - } - } - - recovery_manager.revoke_partition(partition_num=partition_num) - - unassign_call = consumer.incremental_unassign.call_args_list[0].args - assert len(unassign_call) == 1 - assert isinstance(unassign_call[0], list) - assert len(unassign_call[0]) == 2 - for idx, confluent_partition in enumerate(unassign_call[0]): - assert isinstance(confluent_partition, ConfluentPartition) - assert changelog_names[idx] == confluent_partition.topic - assert partition_num == 
confluent_partition.partition - assert not recovery_manager._recovery_partitions - - def test_revoke_partitions( - self, recovery_manager_mock_consumer, recovery_partition_factory - ): - """ - Revoke a topic partition's respective recovery partitions. - """ - recovery_manager = recovery_manager_mock_consumer - topic_name = "topic_name" - partition_num = 1 - changelog_name = f"changelog__{topic_name}__default" - recovery_partition = ( - recovery_partition_factory( - changelog_name=changelog_name, - partition_num=partition_num, - ), - ) - recovery_manager._recovery_partitions = { - partition_num: {changelog_name: recovery_partition} - } - - with patch.object(recovery_manager, "_revoke_recovery_partitions") as revoke: - recovery_manager.revoke_partition(partition_num=partition_num) - - revoke.assert_called_with([recovery_partition], partition_num) - - def test_revoke_partition_not_assigned(self, recovery_manager_mock_consumer): - """ - Skip revoking any recovery partitions for a given partition since none are - currently assigned (due to not needing recovery). - """ - recovery_manager = recovery_manager_mock_consumer - with patch.object(recovery_manager, "_revoke_recovery_partitions") as revoke: - recovery_manager.revoke_partition(partition_num=1) - - revoke.assert_not_called() - - def test_do_recovery( - self, recovery_manager_mock_consumer, recovery_partition_factory - ): - recovery_manager = recovery_manager_mock_consumer - topic_name = "topic_name" - partition_num = 1 - store_names = ["default", "window"] - changelog_names = [f"changelog__{topic_name}__{store}" for store in store_names] - watermarks = [(0, 10), (0, 20)] - changelog_offsets = [0, 0] - - consumer = recovery_manager._consumer - consumer.assignment.return_value = ["assignments"] - - recovery_partitions = [ - recovery_partition_factory( - changelog_name=changelog_names[i], - partition_num=partition_num, - mocked_changelog_offset=changelog_offsets[i], - lowwater=watermarks[i][0], - highwater=watermarks[i][1], - ) - for i in range(len(store_names)) - ] - - patch.object( - recovery_manager, - "_generate_recovery_partitions", - return_value=recovery_partitions, - ).start() - - recovery_manager.assign_partition( - topic_name=topic_name, - partition_num=partition_num, - store_partitions="mocked_out", - ) - with patch.object(recovery_manager, "_recovery_loop") as recovery_loop: - recovery_manager.do_recovery() - - changelog_resume_args = consumer.resume.call_args_list[0].args[0] - print(changelog_resume_args) - assert len(changelog_resume_args) == 2 - for idx, tp in enumerate(changelog_resume_args): - assert recovery_partitions[idx].changelog_name == tp.topic - assert recovery_partitions[idx].partition_num == tp.partition - recovery_loop.assert_called() - assert consumer.resume.call_args_list[1].args[0] == ["assignments"] - assert consumer.resume.call_count == 2 - - def test__recovery_loop( - self, recovery_manager_mock_consumer, recovery_partition_factory - ): - """ - Successfully recover from a changelog message, which is also the last one - for the partition, so revoke it afterward. 
- """ - recovery_manager = recovery_manager_mock_consumer - recovery_manager._running = True - consumer = recovery_manager._consumer - topic_name = "topic_name" - changelog_name = f"changelog__{topic_name}__default" - highwater = 20 - partition_num = 1 - msg = ConfluentKafkaMessageStub( - topic=changelog_name, partition=partition_num, offset=highwater - 1 - ) - consumer.poll.return_value = msg - rp = recovery_partition_factory( - changelog_name=changelog_name, - partition_num=partition_num, - mocked_changelog_offset=highwater, # referenced AFTER recovering from msg - lowwater=0, - highwater=highwater, - ) - recovery_manager._recovery_partitions.setdefault(partition_num, {})[ - changelog_name - ] = rp - - recovery_manager._recovery_loop() - - rp.store_partition.recover_from_changelog_message.assert_called_with( - changelog_message=msg - ) - consumer.incremental_unassign.assert_called() - - def test__recovery_loop_no_partitions(self, recovery_manager_mock_consumer): - recovery_manager = recovery_manager_mock_consumer - consumer = recovery_manager._consumer - - recovery_manager._recovery_loop() - consumer.poll.assert_not_called() diff --git a/tests/test_quixstreams/test_state/test_recovery/__init__.py b/tests/test_quixstreams/test_state/test_recovery/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py b/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py new file mode 100644 index 000000000..9263a3f9b --- /dev/null +++ b/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py @@ -0,0 +1,64 @@ +import uuid + +from quixstreams.state import ChangelogProducer, ChangelogProducerFactory + + +class TestChangelogProducer: + def test_produce( + self, topic_manager_factory, row_producer_factory, consumer_factory + ): + p_num = 2 + cf_header = "my_cf_header" + cf = "my_cf" + expected = { + "key": b"my_key", + "value": b"10", + "headers": [(cf_header, cf.encode())], + "partition": p_num, + } + topic_manager = topic_manager_factory() + changelog = topic_manager.topic( + name=str(uuid.uuid4()), + key_serializer="bytes", + value_serializer="bytes", + config=topic_manager.topic_config(num_partitions=3), + ) + source_topic_name = "source-topic" + topic_manager.create_topics([changelog]) + + producer = ChangelogProducer( + changelog_name=changelog.name, + partition=p_num, + source_topic_name=source_topic_name, + producer=row_producer_factory(), + ) + producer.produce( + **{k: v for k, v in expected.items() if k in ["key", "value"]}, + headers={cf_header: cf}, + ) + producer.flush() + + consumer = consumer_factory(auto_offset_reset="earliest") + consumer.subscribe([changelog.name]) + message = consumer.poll(10) + + for k in expected: + assert getattr(message, k)() == expected[k] + + +class TestChangelogProducerFactory: + def test_get_partition_producer(self, row_producer_factory): + changelog_name = "changelog__topic" + source_topic_name = "source-topic" + producer = row_producer_factory() + + p_num = 1 + + changelog_producer = ChangelogProducerFactory( + changelog_name=changelog_name, + producer=producer, + source_topic_name=source_topic_name, + ).get_partition_producer(partition_num=p_num) + assert changelog_producer.changelog_name == changelog_name + assert changelog_producer.partition == p_num + assert changelog_producer.source_topic_name == source_topic_name diff --git a/tests/test_quixstreams/test_state/test_recovery/test_recovery_manager.py 
b/tests/test_quixstreams/test_state/test_recovery/test_recovery_manager.py new file mode 100644 index 000000000..e27895fa6 --- /dev/null +++ b/tests/test_quixstreams/test_state/test_recovery/test_recovery_manager.py @@ -0,0 +1,375 @@ +from unittest.mock import patch, MagicMock + +from confluent_kafka import TopicPartition as ConfluentPartition + +from quixstreams.kafka import Consumer +from quixstreams.models import TopicManager, TopicConfig +from quixstreams.state import RecoveryPartition +from quixstreams.state.rocksdb import RocksDBStorePartition +from quixstreams.state.rocksdb.metadata import CHANGELOG_CF_MESSAGE_HEADER +from tests.utils import ConfluentKafkaMessageStub + + +class TestRecoveryManager: + def test_register_changelog(self, recovery_manager_factory): + recovery_manager = recovery_manager_factory() + + store_name = "my_store" + kwargs = dict( + topic_name="my_topic_name", + consumer_group="my_group", + ) + with patch.object(TopicManager, "changelog_topic") as make_changelog: + recovery_manager.register_changelog(**kwargs, store_name=store_name) + + make_changelog.assert_called_with(**kwargs, store_name=store_name) + + def test_assign_partition( + self, state_manager_factory, recovery_manager_factory, topic_manager_factory + ): + """ + Check that RecoveryManager.assign_partition() assigns proper changelog topic + partition and pauses the consumer. + """ + + store_name = "default" + # Stored changelog offset is between lowwater and highwater, so the + # given store partition needs to be recovered. + lowwater, highwater = 0, 20 + stored_changelog_offset = 15 + + topic_name = "topic_name" + partition_num = 0 + + consumer = MagicMock(spec_set=Consumer) + topic_manager = topic_manager_factory() + recovery_manager = recovery_manager_factory( + consumer=consumer, topic_manager=topic_manager + ) + state_manager = state_manager_factory(recovery_manager=recovery_manager) + + # Create a topic + topic_manager.topic(topic_name) + # Mock the topic watermarks + consumer.get_watermark_offsets.side_effect = [(lowwater, highwater)] + # Mock the current assignment with some values + assignment = [1, 2, 3] + consumer.assignment.return_value = assignment + + # Create Store and assign a StorePartition (which also sets up changelog topics) + store_partitions = {} + state_manager.register_store(topic_name=topic_name, store_name=store_name) + store = state_manager.get_store(topic=topic_name, store_name=store_name) + partition = store.assign_partition(partition_num) + store_partitions[store_name] = partition + + # Assign a RecoveryPartition + with patch.object( + RocksDBStorePartition, + "get_changelog_offset", + return_value=stored_changelog_offset, + ): + recovery_manager.assign_partition( + topic=topic_name, + partition=partition_num, + store_partitions=store_partitions, + committed_offset=-1001, + ) + + # Check the changelog topic partition is assigned to the consumer + assert consumer.incremental_assign.call_count == 1 + assigned_changelog_partitions = consumer.incremental_assign.call_args[0][0] + assert len(assigned_changelog_partitions) == 1 + + # Check the changelog topic partition properties + changelog_partition = assigned_changelog_partitions[0] + changelog_topic_name = topic_manager.changelog_topics[topic_name][ + store_name + ].name + assert changelog_partition.topic == changelog_topic_name + assert changelog_partition.partition == partition_num + assert changelog_partition.offset == stored_changelog_offset + + # Check that RecoveryPartition is assigned to RecoveryManager + assert 
len(recovery_manager.partitions[partition_num]) == 1 + + # Check that consumer paused all assigned partitions + consumer.pause.assert_called_with(assignment) + + def test_assign_partition_fix_offset_only( + self, + recovery_manager_factory, + recovery_partition_factory, + topic_manager_factory, + ): + """ + Try to recover store partition with changelog offset AHEAD of the watermark. + The offset should be adjusted in this case, but recovery should not be triggered + """ + + topic_name = "topic_name" + partition_num = 0 + store_name = "default" + consumer_group = "group" + lowwater, highwater = 0, 20 + + # Register a source topic and a changelog topic with one partition + topic_manager = topic_manager_factory() + topic_manager.topic(topic_name) + topic_manager.changelog_topic( + topic_name=topic_name, store_name=store_name, consumer_group=consumer_group + ) + + # Mock Consumer + consumer = MagicMock(spec_set=Consumer) + consumer.get_watermark_offsets.return_value = (lowwater, highwater) + consumer.assignment.return_value = "assignments" + + # Mock StorePartition + changelog_offset = 22 + store_partition = MagicMock(spec_set=RocksDBStorePartition) + store_partition.get_changelog_offset.return_value = changelog_offset + + recovery_manager = recovery_manager_factory( + consumer=consumer, topic_manager=topic_manager + ) + + with patch.object(RecoveryPartition, "update_offset") as update_offset: + recovery_manager.assign_partition( + topic=topic_name, + partition=partition_num, + store_partitions={store_name: store_partition}, + committed_offset=-1001, + ) + + # "update_offset()" should be called + update_offset.assert_called() + + # No pause or assignments should happen + consumer.pause.assert_not_called() + consumer.incremental_assign.assert_not_called() + + def test_assign_partitions_during_recovery( + self, + recovery_manager_factory, + recovery_partition_factory, + topic_manager_factory, + ): + """ + Check that RecoveryManager pauses only the source topic partition if + another partition is already recovering. 
+ """ + + topic_name = "topic_name" + consumer_group = "group" + store_name = "default" + changelog_name = f"changelog__{consumer_group}--{topic_name}--{store_name}" + changelog_offset = 5 + lowwater, highwater = 0, 10 + assignment = [0, 1] + + # Register a source topic and a changelog topic with 2 partitions + topic_manager = topic_manager_factory() + topic_manager.topic( + topic_name, config=TopicConfig(num_partitions=2, replication_factor=1) + ) + topic_manager.changelog_topic( + topic_name=topic_name, store_name=store_name, consumer_group=consumer_group + ) + + # Create a RecoveryManager + consumer = MagicMock(spec_set=Consumer) + consumer.assignment.return_value = assignment + recovery_manager = recovery_manager_factory( + consumer=consumer, topic_manager=topic_manager + ) + + # Assign first partition that needs recovery + store_partition = MagicMock(spec_set=RocksDBStorePartition) + consumer.get_watermark_offsets.return_value = (lowwater, highwater) + store_partition.get_changelog_offset.return_value = changelog_offset + recovery_manager.assign_partition( + topic=topic_name, + partition=0, + committed_offset=-1001, + store_partitions={store_name: store_partition}, + ) + assert recovery_manager.partitions + assert recovery_manager.partitions[0][changelog_name].needs_recovery + + # Put a RecoveryManager into "recovering" state + recovery_manager._running = True + assert recovery_manager.recovering + + # Assign second partition that also needs recovery + store_partition = MagicMock(spec_set=RocksDBStorePartition) + store_partition.get_changelog_offset.return_value = 5 + recovery_manager.assign_partition( + topic=topic_name, + partition=1, + committed_offset=-1001, + store_partitions={store_name: store_partition}, + ) + assert recovery_manager.partitions + assert recovery_manager.partitions[1][changelog_name].needs_recovery + + # Check that consumer first paused all partitions + assert consumer.pause.call_args_list[0].args[0] == assignment + + # Check that consumer paused only source topic partition when the second + # recovery partition was assigned + assert consumer.pause.call_args_list[1].args[0] == [ + ConfluentPartition( + topic=topic_name, + partition=1, + offset=-1001, + ) + ] + + def test_revoke_partition(self, recovery_manager_factory, topic_manager_factory): + """ + Revoke a topic partition's respective recovery partitions. 
+ """ + topic_name = "topic_name" + consumer_group = "group" + store_name = "default" + changelog_offset = 5 + lowwater, highwater = 0, 10 + assignment = [0, 1] + changelog_name = f"changelog__{consumer_group}--{topic_name}--{store_name}" + + # Register a source topic and a changelog topic with two partitions + topic_manager = topic_manager_factory() + topic_manager.topic( + topic_name, config=TopicConfig(num_partitions=2, replication_factor=1) + ) + topic_manager.changelog_topic( + topic_name=topic_name, store_name=store_name, consumer_group=consumer_group + ) + + # Create a RecoveryManager + consumer = MagicMock(spec_set=Consumer) + consumer.assignment.return_value = assignment + recovery_manager = recovery_manager_factory( + consumer=consumer, topic_manager=topic_manager + ) + + # Assign partitions that need recovery + store_partition = MagicMock(spec_set=RocksDBStorePartition) + consumer.get_watermark_offsets.return_value = (lowwater, highwater) + store_partition.get_changelog_offset.return_value = changelog_offset + recovery_manager.assign_partition( + topic=topic_name, + partition=0, + committed_offset=-1001, + store_partitions={store_name: store_partition}, + ) + recovery_manager.assign_partition( + topic=topic_name, + partition=1, + committed_offset=-1001, + store_partitions={store_name: store_partition}, + ) + assert len(recovery_manager.partitions) == 2 + + # Revoke one partition + recovery_manager.revoke_partition(0) + assert len(recovery_manager.partitions) == 1 + # Check that consumer unassigned the changelog topic partition as well + assert consumer.incremental_unassign.call_args.args[0] == [ + ConfluentPartition(topic=changelog_name, partition=0) + ] + + # Revoke second partition + recovery_manager.revoke_partition(1) + # Check that consumer unassigned the changelog topic partition as well + assert consumer.incremental_unassign.call_args.args[0] == [ + ConfluentPartition(topic=changelog_name, partition=1) + ] + # Check that no partitions are assigned + assert not recovery_manager.partitions + + def test_revoke_partition_no_partitions_assigned(self, recovery_manager_factory): + """ + Skip revoking any recovery partitions for a given partition since none are + currently assigned (due to not needing recovery). 
+ """ + consumer = MagicMock(spec_set=Consumer) + recovery_manager = recovery_manager_factory(consumer=consumer) + recovery_manager.revoke_partition(partition_num=0) + assert not consumer.incremental_unassign.call_count + + def test_do_recovery( + self, recovery_manager_factory, topic_manager_factory, rocksdb_partition + ): + """ + Test that RecoveryManager.do_recovery(): + - resumes the recovering changelog partition + - applies changes to the StorePartition + - revokes the RecoveryPartition after recovery is done + - unassigns the changelog partition + - unpauses source topic partitions + """ + topic_name = "topic_name" + consumer_group = "group" + store_name = "default" + lowwater, highwater = 0, 10 + assignment = [0, 1] + changelog_name = f"changelog__{consumer_group}--{topic_name}--{store_name}" + + changelog_message = ConfluentKafkaMessageStub( + topic=changelog_name, + partition=0, + offset=highwater - 1, + key=b"key", + value=b"value", + headers=[(CHANGELOG_CF_MESSAGE_HEADER, b"default")], + ) + + # Register a source topic and a changelog topic with one partition + topic_manager = topic_manager_factory() + topic_manager.topic(topic_name) + topic_manager.changelog_topic( + topic_name=topic_name, store_name=store_name, consumer_group=consumer_group + ) + + # Create a RecoveryManager + consumer = MagicMock(spec_set=Consumer) + consumer.poll.return_value = changelog_message + consumer.assignment.return_value = assignment + recovery_manager = recovery_manager_factory( + consumer=consumer, topic_manager=topic_manager + ) + + # Assign a partition that needs recovery + consumer.get_watermark_offsets.return_value = (lowwater, highwater) + recovery_manager.assign_partition( + topic=topic_name, + partition=0, + committed_offset=-1001, + store_partitions={store_name: rocksdb_partition}, + ) + + # Trigger a recovery + recovery_manager.do_recovery() + + # Check that consumer first resumed the changelog topic partition + consumer_resume_calls = consumer.resume.call_args_list + assert consumer_resume_calls[0].args[0] == [ + ConfluentPartition(topic=changelog_name, partition=0) + ] + # Check that consumer resumed all assigned partitions after recovery is done + assert consumer_resume_calls[1].args[0] == assignment + + # Check that RecoveryPartitions are unassigned + assert not recovery_manager.partitions + + def test_do_recovery_no_partitions_assigned(self, recovery_manager_factory): + # Create a RecoveryManager + consumer = MagicMock(spec_set=Consumer) + recovery_manager = recovery_manager_factory(consumer=consumer) + # Trigger a recovery + recovery_manager.do_recovery() + + # Check that consumer.poll() is not called + assert not consumer.poll.called diff --git a/tests/test_quixstreams/test_state/test_recovery/test_recovery_partition.py b/tests/test_quixstreams/test_state/test_recovery/test_recovery_partition.py new file mode 100644 index 000000000..d2c1b67dc --- /dev/null +++ b/tests/test_quixstreams/test_state/test_recovery/test_recovery_partition.py @@ -0,0 +1,68 @@ +import logging +from unittest.mock import MagicMock + +from quixstreams.state.rocksdb import RocksDBStorePartition +from tests.utils import ConfluentKafkaMessageStub + + +class TestRecoveryPartition: + def test_set_watermarks(self, recovery_partition_factory): + recovery_partition = recovery_partition_factory() + recovery_partition.set_watermarks(50, 100) + assert recovery_partition.changelog_lowwater == 50 + assert recovery_partition.changelog_highwater == 100 + + def test_needs_recovery(self, recovery_partition_factory): + 
store_partition = MagicMock(RocksDBStorePartition) + store_partition.get_changelog_offset.return_value = 10 + + recovery_partition = recovery_partition_factory(store_partition=store_partition) + recovery_partition.set_watermarks(0, 20) + assert recovery_partition.needs_recovery + + def test_needs_recovery_caught_up(self, recovery_partition_factory): + store_partition = MagicMock(RocksDBStorePartition) + store_partition.get_changelog_offset.return_value = 10 + recovery_partition = recovery_partition_factory(store_partition=store_partition) + recovery_partition.set_watermarks(0, 20) + store_partition.get_changelog_offset.return_value = 20 + assert not recovery_partition.needs_recovery + + def test_needs_recovery_no_valid_offsets(self, recovery_partition_factory): + # Create a RecoveryPartition with the offset ahead of the watermark + store_partition = MagicMock(RocksDBStorePartition) + store_partition.get_changelog_offset.return_value = 101 + + recovery_partition = recovery_partition_factory(store_partition=store_partition) + recovery_partition.set_watermarks(100, 100) + assert not recovery_partition.needs_recovery + assert recovery_partition.needs_offset_update + + def test_recover_from_changelog_message(self, recovery_partition_factory): + store_partition = MagicMock(RocksDBStorePartition) + store_partition.get_changelog_offset.return_value = 10 + recovery_partition = recovery_partition_factory( + store_partition=store_partition, committed_offset=1 + ) + recovery_partition.set_watermarks(10, 20) + msg = ConfluentKafkaMessageStub() + recovery_partition.recover_from_changelog_message(msg) + + store_partition.recover_from_changelog_message.assert_called_with( + changelog_message=msg, committed_offset=1 + ) + + def test_update_offset(self, recovery_partition_factory, caplog): + store_partition = MagicMock(RocksDBStorePartition) + store_partition.get_changelog_offset.return_value = 10 + lowwater, highwater = 0, 9 + recovery_partition = recovery_partition_factory(store_partition=store_partition) + recovery_partition.set_watermarks(lowwater, highwater) + recovery_partition.update_offset() + + store_partition.set_changelog_offset.assert_called_with( + changelog_offset=highwater - 1 + ) + with caplog.at_level(level=logging.WARNING): + recovery_partition.update_offset() + assert caplog.text diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py index 40fefbb69..c57e55a6d 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py @@ -17,7 +17,7 @@ PREFIX_SEPARATOR, ) from quixstreams.utils.json import dumps -from ...utils import ConfluentKafkaMessageStub +from tests.utils import ConfluentKafkaMessageStub class TestRocksDBStorePartition: @@ -174,7 +174,9 @@ def test_recover_from_changelog_message(self, rocksdb_partition, store_value): offset=50, ) - rocksdb_partition.recover_from_changelog_message(changelog_msg) + rocksdb_partition.recover_from_changelog_message( + changelog_msg, committed_offset=-1001 + ) with rocksdb_partition.begin() as tx: assert tx.get(user_store_key, prefix=kafka_key) == store_value @@ -197,5 +199,7 @@ def test_recover_from_changelog_message_cf_errors( offset=50, ) with pytest.raises(error): - rocksdb_partition.recover_from_changelog_message(changelog_msg) + rocksdb_partition.recover_from_changelog_message( + changelog_msg, committed_offset=-1001 + ) assert rocksdb_partition.get_changelog_offset() is 
None diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py index 78ac3290e..47e4ce176 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_partition.py @@ -10,7 +10,7 @@ ) from quixstreams.state.rocksdb.windowed.serialization import encode_window_key from quixstreams.utils.json import dumps -from tests.test_quixstreams.utils import ConfluentKafkaMessageStub +from tests.utils import ConfluentKafkaMessageStub class TestWindowedRocksDBPartitionTransactionChangelog: @@ -37,7 +37,9 @@ def test_recover_window_from_changelog_message( offset=50, ) - store_partition.recover_from_changelog_message(changelog_msg) + store_partition.recover_from_changelog_message( + changelog_msg, committed_offset=-1001 + ) with store_partition.begin() as tx: assert ( tx.get_window(window["start_ms"], window["end_ms"], prefix=kafka_key) @@ -66,7 +68,9 @@ def test_recover_latest_expire_from_changelog_message( offset=50, ) - store_partition.recover_from_changelog_message(changelog_msg) + store_partition.recover_from_changelog_message( + changelog_msg, committed_offset=-1001 + ) with store_partition.begin() as tx: assert ( diff --git a/tests/test_quixstreams/utils.py b/tests/test_quixstreams/utils.py deleted file mode 100644 index 645106a5a..000000000 --- a/tests/test_quixstreams/utils.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import Optional, List, Tuple, Union - - -class ConfluentKafkaMessageStub: - """ - A stub object to mock `confluent_kafka.Message`. - - Instances of `confluent_kafka.Message` cannot be directly created from Python, - see https://github.com/confluentinc/confluent-kafka-python/issues/1535. 
- - """ - - def __init__( - self, - topic: str = "test", - partition: int = 0, - offset: int = 0, - timestamp: Tuple[int, int] = (1, 123), - key: bytes = None, - value: bytes = None, - headers: Optional[List[Tuple[str, bytes]]] = None, - latency: float = None, - leader_epoch: int = None, - ): - self._topic = topic - self._partition = partition - self._offset = offset - self._timestamp = timestamp - self._key = key - self._value = value - self._headers = headers - self._latency = latency - self._leader_epoch = leader_epoch - - def headers(self, *args, **kwargs) -> Optional[List[Tuple[str, bytes]]]: - return self._headers - - def key(self, *args, **kwargs) -> Optional[Union[str, bytes]]: - return self._key - - def offset(self, *args, **kwargs) -> int: - return self._offset - - def partition(self, *args, **kwargs) -> int: - return self._partition - - def timestamp(self, *args, **kwargs) -> (int, int): - return self._timestamp - - def topic(self, *args, **kwargs) -> str: - return self._topic - - def value(self, *args, **kwargs) -> Optional[Union[str, bytes]]: - return self._value - - def latency(self, *args, **kwargs) -> Optional[float]: - return self._latency - - def leader_epoch(self, *args, **kwargs) -> Optional[int]: - return self._leader_epoch - - def __len__(self) -> int: - return len(self._value) diff --git a/tests/utils.py b/tests/utils.py index a1ceeb476..786dcb9ba 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,6 +1,6 @@ import dataclasses import time - +from typing import Optional, List, Tuple, Union from confluent_kafka import OFFSET_INVALID DEFAULT_TIMEOUT = 10.0 @@ -31,3 +31,65 @@ class TopicPartitionStub: topic: str partition: int offset: int = OFFSET_INVALID + + +class ConfluentKafkaMessageStub: + """ + A stub object to mock `confluent_kafka.Message`. + + Instances of `confluent_kafka.Message` cannot be directly created from Python, + see https://github.com/confluentinc/confluent-kafka-python/issues/1535. 
+ + """ + + def __init__( + self, + topic: str = "test", + partition: int = 0, + offset: int = 0, + timestamp: Tuple[int, int] = (1, 123), + key: bytes = None, + value: bytes = None, + headers: Optional[List[Tuple[str, bytes]]] = None, + latency: float = None, + leader_epoch: int = None, + ): + self._topic = topic + self._partition = partition + self._offset = offset + self._timestamp = timestamp + self._key = key + self._value = value + self._headers = headers + self._latency = latency + self._leader_epoch = leader_epoch + + def headers(self, *args, **kwargs) -> Optional[List[Tuple[str, bytes]]]: + return self._headers + + def key(self, *args, **kwargs) -> Optional[Union[str, bytes]]: + return self._key + + def offset(self, *args, **kwargs) -> int: + return self._offset + + def partition(self, *args, **kwargs) -> int: + return self._partition + + def timestamp(self, *args, **kwargs) -> (int, int): + return self._timestamp + + def topic(self, *args, **kwargs) -> str: + return self._topic + + def value(self, *args, **kwargs) -> Optional[Union[str, bytes]]: + return self._value + + def latency(self, *args, **kwargs) -> Optional[float]: + return self._latency + + def leader_epoch(self, *args, **kwargs) -> Optional[int]: + return self._leader_epoch + + def __len__(self) -> int: + return len(self._value) From d784d310d0a1d01610743cf9a73bb8406dffa6b5 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 30 Apr 2024 19:07:59 +0200 Subject: [PATCH 17/28] Remove ApplicationStatus enum --- quixstreams/app.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/quixstreams/app.py b/quixstreams/app.py index ec6b875a3..35ae9d313 100644 --- a/quixstreams/app.py +++ b/quixstreams/app.py @@ -1,5 +1,4 @@ import contextlib -import enum import functools import logging import os @@ -54,13 +53,6 @@ MessageProcessedCallback = Callable[[str, int, int], None] -class ApplicationStatus(enum.Enum): - CREATED = 1 - RUNNING = 2 - FAILED = 3 - STOPPED = 4 - - class Application: """ The main Application class. @@ -262,7 +254,8 @@ def __init__( self._on_processing_error = on_processing_error or default_on_processing_error self._on_message_processed = on_message_processed self._auto_create_topics = auto_create_topics - self._status: ApplicationStatus = ApplicationStatus.CREATED + self._running = False + self._failed = False if not topic_manager: topic_manager = topic_manager_factory( @@ -558,9 +551,15 @@ def stop(self, fail: bool = False): (like Kubernetes does) or perform a typical `KeyboardInterrupt` (`Ctrl+C`). :param fail: if True, signals that application is stopped due - to unhandled exception and it shouldn't commit the current checkpoint. + to unhandled exception, and it shouldn't commit the current checkpoint. 
""" - self._status = ApplicationStatus.FAILED if fail else ApplicationStatus.STOPPED + + self._running = False + if fail: + # Update "_failed" only when fail=True to prevent stop(failed=False) from + # resetting it + self._failed = True + if self._state_manager.using_changelogs: self._state_manager.stop_recovery() @@ -705,14 +704,14 @@ def run( ) logger.info("Waiting for incoming messages") # Start polling Kafka for messages and callbacks - self._status = ApplicationStatus.RUNNING + self._running = True # Initialize the checkpoint self._processing_context.init_checkpoint() dataframe_composed = dataframe.compose() - while self._status == ApplicationStatus.RUNNING: + while self._running: if self._state_manager.recovery_required: self._state_manager.do_recovery() else: @@ -842,10 +841,15 @@ def _on_revoke(self, _, topic_partitions: List[TopicPartition]): Revoke partitions from consumer and state """ # Commit everything processed so far unless the application is closing - # because of unhandled exception. + # because of the unhandled exception. # In this case, we should drop the checkpoint and let another consumer # pick up from the latest one - if not self._status == ApplicationStatus.FAILED: + if self._failed: + logger.warning( + "Application is stopping due to failure, " + "latest checkpoint will not be committed." + ) + else: self._processing_context.commit_checkpoint(force=True) self._consumer.incremental_unassign(topic_partitions) From 847dfaac0856975801b4619389d1af235667c969 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 30 Apr 2024 19:08:20 +0200 Subject: [PATCH 18/28] Implement consistent recovery --- quixstreams/checkpointing/checkpoint.py | 2 +- quixstreams/state/rocksdb/partition.py | 51 ++++++- tests/test_quixstreams/test_app.py | 132 +++++++++++++++++- .../test_state/test_rocksdb/test_partition.py | 86 +++++++++++- 4 files changed, 262 insertions(+), 9 deletions(-) diff --git a/quixstreams/checkpointing/checkpoint.py b/quixstreams/checkpointing/checkpoint.py index 9d16164e2..7d119297a 100644 --- a/quixstreams/checkpointing/checkpoint.py +++ b/quixstreams/checkpointing/checkpoint.py @@ -149,7 +149,7 @@ def commit(self): offset = self._tp_offsets[(topic, partition)] # Get the changelog topic-partition for the given transaction - # It can be None if changelog topics are disabled in the app + # It can be None if changelog topics are disabled in the app config changelog_tp = transaction.changelog_topic_partition # The changelog offset also can be None if no updates happened # during transaction diff --git a/quixstreams/state/rocksdb/partition.py b/quixstreams/state/rocksdb/partition.py index 124ce25a1..1df954c59 100644 --- a/quixstreams/state/rocksdb/partition.py +++ b/quixstreams/state/rocksdb/partition.py @@ -100,12 +100,48 @@ def _changelog_recover_flush(self, changelog_offset: int, batch: WriteBatch): ) self.write(batch) + def _should_skip_changelog( + self, headers: Dict[str, bytes], committed_offset: int + ) -> bool: + """ + Determine whether the changelog update should be skipped. + + :param headers: changelog message headers + :param committed_offset: latest committed offset of the source topic partition + :return: True if update should be skipped, else False. + """ + # Parse the processed topic-partition-offset info from the changelog message + # headers to determine whether the update should be applied or skipped. + # It can be empty if the message was produced by the older version of the lib. 
+        processed_offset_header = headers.get(
+            CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, b"[]"
+        )
+        processed_offset_data = json_loads(processed_offset_header)
+        if processed_offset_data:
+            # Skip recovering from the message if its processed offset is ahead of the
+            # current committed offset.
+            # This way it will recover to a consistent state if the checkpointing code
+            # produced the changelog messages but failed to commit
+            # the source topic offset.
+            _, _, processed_offset = processed_offset_data
+            return processed_offset >= committed_offset
+        return False
+
     def recover_from_changelog_message(
         self, changelog_message: ConfluentKafkaMessageProto, committed_offset: int
    ):
         """
         Updates state from a given changelog message.

+        The actual update may be skipped when both conditions are met:
+
+        - The changelog message has headers with the processed message offset.
+        - This processed offset is larger than the latest committed offset for the same
+          topic partition.
+
+        This way the state does not apply the state changes for not-yet-committed
+        messages and improves the state consistency guarantees.
+
         :param changelog_message: A raw Confluent message read from a changelog topic.
         :param committed_offset: latest committed offset for the partition
         """
@@ -119,11 +155,16 @@ def recover_from_changelog_message(
         cf_handle = self.get_column_family_handle(cf_name)

         batch = WriteBatch(raw_mode=True)
-        key = changelog_message.key()
-        if value := changelog_message.value():
-            batch.put(key, value, cf_handle)
-        else:
-            batch.delete(key, cf_handle)
+        # Determine whether the update should be applied or skipped based on the
+        # latest committed offset and processed offset from the changelog message header
+        if not self._should_skip_changelog(
+            headers=headers, committed_offset=committed_offset
+        ):
+            key = changelog_message.key()
+            if value := changelog_message.value():
+                batch.put(key, value, cf_handle)
+            else:
+                batch.delete(key, cf_handle)

         self._changelog_recover_flush(changelog_message.offset(), batch)

diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py
index 7846117e7..17bafb91f 100644
--- a/tests/test_quixstreams/test_app.py
+++ b/tests/test_quixstreams/test_app.py
@@ -1,3 +1,4 @@
+import contextlib
 import logging
 import os
 import time
@@ -12,6 +13,7 @@
 from quixstreams.app import Application
 from quixstreams.dataframe import StreamingDataFrame
 from quixstreams.dataframe.windows.base import get_window_ranges
+from quixstreams.exceptions import PartitionAssignmentError
 from quixstreams.kafka.exceptions import KafkaConsumerException
 from quixstreams.models import (
     DoubleDeserializer,
@@ -210,7 +212,7 @@ def count_and_fail(_):
         failed = Future()

         # Stop app when the future is resolved
-        executor.submit(_stop_app_on_future, app, failed, 15.0)
+        executor.submit(_stop_app_on_future, app, failed, 10.0)
         with pytest.raises(ValueError):
             app.run(sdf)
@@ -1453,3 +1455,131 @@ def validate_state():
         assert processed_count == {0: 0, 1: 0}
         # State should be the same as before deletion
         validate_state()
+
+    def test_changelog_recovery_consistent_after_failed_commit(
+        self, app_factory, executor, tmp_path, state_manager_factory, consumer_factory
+    ):
+        """
+        Scenario: application processes messages and successfully produces changelog
+        messages but fails to commit the topic offsets.
+
+        We expect that the app will be recovered to a consistent state and changes
+        for the yet uncommitted messages will not be applied.
+        """
+        consumer_group = str(uuid.uuid4())
+        state_dir = (tmp_path / "state").absolute()
+        topic_name = str(uuid.uuid4())
+        store_name = "default"
+
+        # Messages to be processed successfully
+        succeeded_messages = [
+            ("key1", "1"),
+            ("key2", "2"),
+            ("key3", "3"),
+        ]
+        # Messages to fail
+        failed_messages = [
+            ("key1", "4"),
+            ("key2", "5"),
+            ("key3", "6"),
+        ]
+        # Ensure the same number of messages in both sets to simplify testing
+        assert len(failed_messages) == len(succeeded_messages)
+        total_count = len(succeeded_messages)
+        processed_count = 0
+
+        def on_message_processed(topic_, partition, offset):
+            nonlocal processed_count
+            # Set the callback to track total messages processed
+            # The callback is not triggered if processing fails
+            processed_count += 1
+            if processed_count == total_count:
+                done.set_result(True)
+
+        def get_app():
+            app = app_factory(
+                commit_interval=999,  # Simulate a very long commit interval
+                auto_offset_reset="earliest",
+                use_changelog_topics=True,
+                on_message_processed=on_message_processed,
+                consumer_group=consumer_group,
+                state_dir=state_dir,
+            )
+            topic = app.topic(topic_name)
+            sdf = app.dataframe(topic)
+            sdf = sdf.update(
+                lambda value, state: state.set("latest", value["number"]), stateful=True
+            )
+            return app, sdf, topic
+
+        def validate_state():
+            with state_manager_factory(
+                group_id=consumer_group,
+                state_dir=state_dir,
+            ) as state_manager, consumer_factory(
+                consumer_group=consumer_group
+            ) as consumer:
+                committed_offset = consumer.committed(
+                    [TopicPartition(topic=topic_name, partition=0)]
+                )[0].offset
+                state_manager.register_store(topic.name, store_name)
+                partition = state_manager.on_partition_assign(
+                    topic=topic.name, partition=0, committed_offset=committed_offset
+                )[0]
+                with partition.begin() as tx:
+                    for key, value in succeeded_messages:
+                        state = tx.as_state(prefix=key.encode())
+                        assert state.get("latest") == value
+
+        # Produce messages from the "succeeded" set
+        app, sdf, topic = get_app()
+        with app.get_producer() as producer:
+            for key, value in succeeded_messages:
+                serialized = topic.serialize(key=key.encode(), value={"number": value})
+                producer.produce(topic.name, key=serialized.key, value=serialized.value)
+
+        # Run the application to apply changes to state
+        done = Future()
+        executor.submit(_stop_app_on_future, app, done, 10.0)
+        app.run(sdf)
+        assert processed_count == total_count
+        # Validate the state
+        validate_state()
+
+        # Init application again
+        processed_count = 0
+        app, sdf, topic = get_app()
+
+        # Produce messages from the "failed" set
+        with app.get_producer() as producer:
+            for key, value in failed_messages:
+                serialized = topic.serialize(key=key.encode(), value={"number": value})
+                producer.produce(topic.name, key=serialized.key, value=serialized.value)
+
+        # Run the app a second time and fail the consumer commit
+        with patch.object(
+            RowConsumer, "commit", side_effect=ValueError("commit failed")
+        ):
+            done = Future()
+            executor.submit(_stop_app_on_future, app, done, 10.0)
+            with contextlib.suppress(PartitionAssignmentError):
+                app.run(sdf)
+
+        validate_state()
+
+        # Run the app again to recover the state
+        app, sdf, topic = get_app()
+        # Clear the state to recover from scratch
+        app.clear_state()
+
+        # Run app for the third time and fail on commit to prevent state changes
+        with patch.object(
+            RowConsumer, "commit", side_effect=ValueError("commit failed")
+        ):
+            done = Future()
+            executor.submit(_stop_app_on_future, app, done, 10.0)
+            with contextlib.suppress(PartitionAssignmentError):
app.run(sdf) + + # The app should be recovered + validate_state() diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py index c57e55a6d..90ed12d60 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py @@ -15,6 +15,7 @@ from quixstreams.state.rocksdb.metadata import ( CHANGELOG_CF_MESSAGE_HEADER, PREFIX_SEPARATOR, + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, ) from quixstreams.utils.json import dumps from tests.utils import ConfluentKafkaMessageStub @@ -161,7 +162,9 @@ def test_ensure_metadata_cf(self, rocksdb_partition): class TestRocksDBStorePartitionChangelog: @pytest.mark.parametrize("store_value", [10, None]) - def test_recover_from_changelog_message(self, rocksdb_partition, store_value): + def test_recover_from_changelog_message_no_processed_offset( + self, rocksdb_partition, store_value + ): """ Tests both a put (10) and delete (None) """ @@ -189,7 +192,7 @@ def test_recover_from_changelog_message(self, rocksdb_partition, store_value): ([], ColumnFamilyHeaderMissing), ], ) - def test_recover_from_changelog_message_cf_errors( + def test_recover_from_changelog_message_missing_cf_headers( self, rocksdb_partition, headers, error ): changelog_msg = ConfluentKafkaMessageStub( @@ -203,3 +206,82 @@ def test_recover_from_changelog_message_cf_errors( changelog_msg, committed_offset=-1001 ) assert rocksdb_partition.get_changelog_offset() is None + + def test_recover_from_changelog_message_with_processed_offset_behind_committed( + self, rocksdb_partition + ): + """ + Test that changes from the changelog topic are applied if the + source topic offset header is present and is smaller than the latest committed + offset. + """ + kafka_key = b"my_key" + user_store_key = "count" + + processed_offset_header = ( + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, + dumps(["topic", 0, 1]), + ) + committted_offset = 2 + changelog_msg = ConfluentKafkaMessageStub( + key=kafka_key + PREFIX_SEPARATOR + dumps(user_store_key), + value=dumps(10), + headers=[ + (CHANGELOG_CF_MESSAGE_HEADER, b"default"), + processed_offset_header, + ], + ) + + rocksdb_partition.recover_from_changelog_message( + changelog_msg, committed_offset=committted_offset + ) + + with rocksdb_partition.begin() as tx: + assert tx.get(user_store_key, prefix=kafka_key) == 10 + assert rocksdb_partition.get_changelog_offset() == changelog_msg.offset() + 1 + + def test_recover_from_changelog_message_with_processed_offset_ahead_committed( + self, rocksdb_partition + ): + """ + Test that changes from the changelog topic are NOT applied if the + source topic offset header is present but larger than the latest committed + offset. + It means that the changelog messages were produced during the checkpoint, + but the topic offset was not committed. 
+ Possible reasons: + - Producer couldn't verify the delivery of every changelog message + - Consumer failed to commit the source topic offsets + """ + kafka_key = b"my_key" + user_store_key = "count" + # Processed offset should be strictly lower than committed offset for + # the change to be applied + processed_offset = 2 + committed_offset = 2 + + # Generate the changelog message with processed offset ahead of the committed + # one + processed_offset_header = ( + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, + dumps(["topic", 0, processed_offset]), + ) + changelog_msg = ConfluentKafkaMessageStub( + key=kafka_key + PREFIX_SEPARATOR + dumps(user_store_key), + value=dumps(10), + headers=[ + (CHANGELOG_CF_MESSAGE_HEADER, b"default"), + processed_offset_header, + ], + ) + + # Recover from the message + rocksdb_partition.recover_from_changelog_message( + changelog_msg, committed_offset=committed_offset + ) + + # Check that the changes have not been applied, but the changelog offset + # increased + with rocksdb_partition.begin() as tx: + assert tx.get(user_store_key, prefix=kafka_key) is None + assert rocksdb_partition.get_changelog_offset() == changelog_msg.offset() + 1 From d8e95e480d5e5fe1ca63ef5bb67e655a021a772d Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Wed, 1 May 2024 13:08:39 +0200 Subject: [PATCH 19/28] Enable idempotence for internal RowProducer --- quixstreams/app.py | 16 ++++++++-- tests/test_quixstreams/test_app.py | 48 +++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/quixstreams/app.py b/quixstreams/app.py index 35ae9d313..3a0c74e77 100644 --- a/quixstreams/app.py +++ b/quixstreams/app.py @@ -52,6 +52,9 @@ logger = logging.getLogger(__name__) MessageProcessedCallback = Callable[[str, int, int], None] +# Enforce idempotent producing for the internal RowProducer +_default_producer_extra_config = {"enable.idempotence": True} + class Application: """ @@ -176,6 +179,15 @@ def __init__( > NOTE: It is recommended to just use `quix_sdk_token` instead. """ configure_logging(loglevel=loglevel) + producer_extra_config = producer_extra_config or {} + consumer_extra_config = consumer_extra_config or {} + + # Add default values to the producer config, but allow them to be overwritten + # by the provided producer_extra_config dict + producer_extra_config = { + **_default_producer_extra_config, + **producer_extra_config, + } # We can't use os.getenv as defaults (and have testing work nicely) # since it evaluates getenv when the function is defined. 
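Because later keys win when unpacking dicts, the built-in default above is applied first and anything the user passes in `producer_extra_config` takes precedence, while unrelated defaults are preserved. A minimal sketch of that precedence (the user-supplied values here are hypothetical):

```python
# Defaults are unpacked first, so user-supplied keys win on conflict.
_default_producer_extra_config = {"enable.idempotence": True}

user_config = {"enable.idempotence": False, "linger.ms": 10}
merged = {**_default_producer_extra_config, **user_config}
assert merged == {"enable.idempotence": False, "linger.ms": 10}

# Without an explicit override, the idempotence default is kept.
merged = {**_default_producer_extra_config, **{"linger.ms": 10}}
assert merged == {"enable.idempotence": True, "linger.ms": 10}
```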
@@ -218,8 +230,8 @@ def __init__( broker_address = quix_configs.pop("bootstrap.servers") # Quix Cloud prefixes consumer group with workspace id consumer_group = quix_config_builder.prepend_workspace_id(consumer_group) - consumer_extra_config = {**quix_configs, **(consumer_extra_config or {})} - producer_extra_config = {**quix_configs, **(producer_extra_config or {})} + consumer_extra_config = {**quix_configs, **consumer_extra_config} + producer_extra_config = {**quix_configs, **producer_extra_config} else: # Only broker address is provided topic_manager_factory = TopicManager diff --git a/tests/test_quixstreams/test_app.py b/tests/test_quixstreams/test_app.py index 17bafb91f..ee9ec4e2f 100644 --- a/tests/test_quixstreams/test_app.py +++ b/tests/test_quixstreams/test_app.py @@ -491,11 +491,11 @@ def test_producer_extra_config(self, app_factory): Test that producer receives the Application extra configs """ app = app_factory( - producer_extra_config={"max.in.flight": "123"}, + producer_extra_config={"linger.ms": 10}, ) with app.get_producer() as x: - assert x._producer_config["max.in.flight"] is "123" + assert x._producer_config["linger.ms"] == 10 def test_missing_broker_id_raise(self): # confirm environment is empty @@ -544,7 +544,12 @@ def test_init_with_quix_sdk_token_arg(self): **auth_params, "bootstrap.servers": broker_address, } - expected_extra_config = {**auth_params, **extra_config} + expected_producer_extra_config = { + "enable.idempotence": True, + **auth_params, + **extra_config, + } + expected_consumer_extra_config = {**auth_params, **extra_config} def get_cfg_builder(quix_sdk_token): cfg_builder = create_autospec(QuixKafkaConfigsBuilder) @@ -572,12 +577,12 @@ def get_cfg_builder(quix_sdk_token): # to the low-level configs of producer and consumer producer_call_kwargs = producer_init_mock.call_args.kwargs assert producer_call_kwargs["broker_address"] == broker_address - assert producer_call_kwargs["extra_config"] == expected_extra_config + assert producer_call_kwargs["extra_config"] == expected_producer_extra_config consumer_call_kwargs = consumer_init_mock.call_args.kwargs assert consumer_call_kwargs["broker_address"] == broker_address assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup - assert consumer_call_kwargs["extra_config"] == expected_extra_config + assert consumer_call_kwargs["extra_config"] == expected_consumer_extra_config def test_init_with_quix_sdk_token_env(self, monkeypatch): consumer_group = "c_group" @@ -597,7 +602,12 @@ def test_init_with_quix_sdk_token_env(self, monkeypatch): **auth_params, "bootstrap.servers": broker_address, } - expected_extra_config = {**auth_params, **extra_config} + expected_producer_extra_config = { + "enable.idempotence": True, + **auth_params, + **extra_config, + } + expected_consumer_extra_config = {**auth_params, **extra_config} def get_cfg_builder(quix_sdk_token): cfg_builder = create_autospec(QuixKafkaConfigsBuilder) @@ -624,12 +634,12 @@ def get_cfg_builder(quix_sdk_token): # to the low-level configs of producer and consumer producer_call_kwargs = producer_init_mock.call_args.kwargs assert producer_call_kwargs["broker_address"] == broker_address - assert producer_call_kwargs["extra_config"] == expected_extra_config + assert producer_call_kwargs["extra_config"] == expected_producer_extra_config consumer_call_kwargs = consumer_init_mock.call_args.kwargs assert consumer_call_kwargs["broker_address"] == broker_address assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup - assert 
consumer_call_kwargs["extra_config"] == expected_extra_config + assert consumer_call_kwargs["extra_config"] == expected_consumer_extra_config def test_init_with_quix_config_builder(self): consumer_group = "c_group" @@ -649,7 +659,12 @@ def test_init_with_quix_config_builder(self): **auth_params, "bootstrap.servers": broker_address, } - expected_extra_config = {**auth_params, **extra_config} + expected_producer_extra_config = { + "enable.idempotence": True, + **auth_params, + **extra_config, + } + expected_consumer_extra_config = {**auth_params, **extra_config} def get_cfg_builder(quix_sdk_token): cfg_builder = create_autospec(QuixKafkaConfigsBuilder) @@ -674,12 +689,12 @@ def get_cfg_builder(quix_sdk_token): # to the low-level configs of producer and consumer producer_call_kwargs = producer_init_mock.call_args.kwargs assert producer_call_kwargs["broker_address"] == broker_address - assert producer_call_kwargs["extra_config"] == expected_extra_config + assert producer_call_kwargs["extra_config"] == expected_producer_extra_config consumer_call_kwargs = consumer_init_mock.call_args.kwargs assert consumer_call_kwargs["broker_address"] == broker_address assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup - assert consumer_call_kwargs["extra_config"] == expected_extra_config + assert consumer_call_kwargs["extra_config"] == expected_consumer_extra_config def test_init_with_broker_id_raises(self): with pytest.raises(ValueError) as e_info: @@ -788,7 +803,12 @@ def test_init(self): **auth_params, "bootstrap.servers": broker_address, } - expected_extra_config = {**auth_params, **extra_config} + expected_producer_extra_config = { + "enable.idempotence": True, + **auth_params, + **extra_config, + } + expected_consumer_extra_config = {**auth_params, **extra_config} cfg_builder = create_autospec(QuixKafkaConfigsBuilder) cfg_builder.get_confluent_broker_config.return_value = confluent_broker_config @@ -808,12 +828,12 @@ def test_init(self): # to the low-level configs of producer and consumer producer_call_kwargs = producer_init_mock.call_args.kwargs assert producer_call_kwargs["broker_address"] == broker_address - assert producer_call_kwargs["extra_config"] == expected_extra_config + assert producer_call_kwargs["extra_config"] == expected_producer_extra_config consumer_call_kwargs = consumer_init_mock.call_args.kwargs assert consumer_call_kwargs["broker_address"] == broker_address assert consumer_call_kwargs["consumer_group"] == expected_workspace_cgroup - assert consumer_call_kwargs["extra_config"] == expected_extra_config + assert consumer_call_kwargs["extra_config"] == expected_consumer_extra_config cfg_builder.prepend_workspace_id.assert_called_with("c_group") From db2523e193677e0485b255e333db9ae3ef9fd25b Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Thu, 2 May 2024 17:21:10 +0200 Subject: [PATCH 20/28] Remove topic and partition values from the changelog messages --- quixstreams/state/rocksdb/partition.py | 8 ++--- quixstreams/state/rocksdb/transaction.py | 4 +-- .../test_state/test_rocksdb/test_partition.py | 4 +-- .../test_rocksdb/test_transaction.py | 36 ++----------------- .../test_windowed/test_transaction.py | 24 ++----------- 5 files changed, 11 insertions(+), 65 deletions(-) diff --git a/quixstreams/state/rocksdb/partition.py b/quixstreams/state/rocksdb/partition.py index 1df954c59..27d1a4540 100644 --- a/quixstreams/state/rocksdb/partition.py +++ b/quixstreams/state/rocksdb/partition.py @@ -34,7 +34,6 @@ __all__ = ("RocksDBStorePartition",) - logger = 
logging.getLogger(__name__) @@ -114,16 +113,15 @@ def _should_skip_changelog( # headers to determine whether the update should be applied or skipped. # It can be empty if the message was produced by the older version of the lib. processed_offset_header = headers.get( - CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, b"[]" + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, b"null" ) - processed_offset_data = json_loads(processed_offset_header) - if processed_offset_data: + processed_offset = json_loads(processed_offset_header) + if processed_offset is not None: # Skip recovering from the message if its processed offset is ahead of the # current committed offset. # This way it will recover to a consistent state if the checkpointing code # produced the changelog messages but failed to commit # the source topic offset. - _, _, processed_offset = processed_offset_data return processed_offset >= committed_offset return False diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py index 499795101..26c546a42 100644 --- a/quixstreams/state/rocksdb/transaction.py +++ b/quixstreams/state/rocksdb/transaction.py @@ -381,9 +381,7 @@ def _produce_changelog(self, processed_offset: Optional[int] = None): ) # Iterate over the transaction update cache for cf_name, cf_update_cache in self._update_cache.items(): - source_tp_offset_header = json_dumps( - [source_topic, partition, processed_offset] - ) + source_tp_offset_header = json_dumps(processed_offset) headers = { CHANGELOG_CF_MESSAGE_HEADER: cf_name, CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: source_tp_offset_header, diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py index 90ed12d60..2531414e8 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_partition.py @@ -220,7 +220,7 @@ def test_recover_from_changelog_message_with_processed_offset_behind_committed( processed_offset_header = ( CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, - dumps(["topic", 0, 1]), + dumps(1), ) committted_offset = 2 changelog_msg = ConfluentKafkaMessageStub( @@ -264,7 +264,7 @@ def test_recover_from_changelog_message_with_processed_offset_ahead_committed( # one processed_offset_header = ( CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER, - dumps(["topic", 0, processed_offset]), + dumps(processed_offset), ) changelog_msg = ConfluentKafkaMessageStub( key=kafka_key + PREFIX_SEPARATOR + dumps(user_store_key), diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py index d9adcb9c7..f8099e486 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_transaction.py @@ -448,10 +448,6 @@ def test_set_and_prepare(self, rocksdb_partition_factory, changelog_producer_moc ] cf = "default" prefix = b"__key__" - source_topic_name, source_partition = ( - changelog_producer_mock.source_topic_name, - changelog_producer_mock.partition, - ) processed_offset = 1 with rocksdb_partition_factory( @@ -476,13 +472,7 @@ def test_set_and_prepare(self, rocksdb_partition_factory, changelog_producer_moc assert call.kwargs["value"] == tx._serialize_value(value=value) assert call.kwargs["headers"] == { CHANGELOG_CF_MESSAGE_HEADER: cf, - CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( - [ - source_topic_name, - source_partition, - processed_offset, - 
] - ), + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset), } assert tx.prepared @@ -493,10 +483,6 @@ def test_delete_and_prepare( key, value = "key", "value" cf = "default" prefix = b"__key__" - source_topic_name, source_partition = ( - changelog_producer_mock.source_topic_name, - changelog_producer_mock.partition, - ) processed_offset = 1 with rocksdb_partition_factory( @@ -518,13 +504,7 @@ def test_delete_and_prepare( assert delete_changelog.kwargs["value"] is None assert delete_changelog.kwargs["headers"] == { CHANGELOG_CF_MESSAGE_HEADER: cf, - CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( - [ - source_topic_name, - source_partition, - processed_offset, - ] - ), + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset), } def test_set_delete_and_prepare( @@ -537,10 +517,6 @@ def test_set_delete_and_prepare( key, value = "key", "value" cf = "default" prefix = b"__key__" - source_topic_name, source_partition = ( - changelog_producer_mock.source_topic_name, - changelog_producer_mock.partition, - ) processed_offset = 1 with rocksdb_partition_factory( @@ -561,11 +537,5 @@ def test_set_delete_and_prepare( assert delete_changelog.kwargs["value"] is None assert delete_changelog.kwargs["headers"] == { CHANGELOG_CF_MESSAGE_HEADER: cf, - CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( - [ - source_topic_name, - source_partition, - processed_offset, - ] - ), + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset), } diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py index 83d6ca3a2..20299a116 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/test_transaction.py @@ -321,10 +321,6 @@ def test_update_window_and_prepare( start_ms = 0 end_ms = 10 value = 1 - source_topic_name, source_partition = ( - changelog_producer_mock.source_topic_name, - changelog_producer_mock.partition, - ) processed_offset = 1 with windowed_rocksdb_partition_factory( @@ -351,13 +347,7 @@ def test_update_window_and_prepare( value=expected_produced_value, headers={ CHANGELOG_CF_MESSAGE_HEADER: "default", - CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( - [ - source_topic_name, - source_partition, - processed_offset, - ] - ), + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset), }, ) @@ -367,10 +357,6 @@ def test_delete_window_and_prepare( prefix = b"__key__" start_ms = 0 end_ms = 10 - source_topic_name, source_partition = ( - changelog_producer_mock.source_topic_name, - changelog_producer_mock.partition, - ) processed_offset = 1 with windowed_rocksdb_partition_factory( @@ -391,12 +377,6 @@ def test_delete_window_and_prepare( value=None, headers={ CHANGELOG_CF_MESSAGE_HEADER: "default", - CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps( - [ - source_topic_name, - source_partition, - processed_offset, - ] - ), + CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: dumps(processed_offset), }, ) From b328a7de83945fef6eb22832d6620d1e0cf748ea Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 7 May 2024 16:23:35 +0200 Subject: [PATCH 21/28] Update Checkpoint.commit docstring --- quixstreams/checkpointing/checkpoint.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/quixstreams/checkpointing/checkpoint.py b/quixstreams/checkpointing/checkpoint.py index 7d119297a..e437e3b17 100644 --- 
a/quixstreams/checkpointing/checkpoint.py +++ b/quixstreams/checkpointing/checkpoint.py @@ -103,9 +103,10 @@ def commit(self): Commit the checkpoint. This method will: - 1. Flush the changelogs for each state store and ensure everything is produced. - 2. Commit topic offsets. - 3. Flush each state store partition to the disk. + 1. Produce the changelogs for each state store + 2. Flush the producer to ensure everything is delivered. + 3. Commit topic offsets. + 4. Flush each state store partition to the disk. """ if not self._tp_offsets: From c3723e98292d2c9a44228cefd5431f459890ad20 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 7 May 2024 16:30:57 +0200 Subject: [PATCH 22/28] Add commit_interval to Application docstring --- quixstreams/app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quixstreams/app.py b/quixstreams/app.py index 3a0c74e77..0dc825598 100644 --- a/quixstreams/app.py +++ b/quixstreams/app.py @@ -134,8 +134,9 @@ def __init__( Linked Environment Variable: `Quix__Consumer__Group`. Default - "quixstreams-default" (set during init) >***NOTE:*** Quix Applications will prefix it with the Quix workspace id. + :param commit_interval: How often to commit the processed messages in seconds. + Default - 5.0. :param auto_offset_reset: Consumer `auto.offset.reset` setting - :param partitioner: A function to be used to determine the outgoing message partition. :param consumer_extra_config: A dictionary with additional options that From 21918f0c51a8d2e28a45f755e685f8aad78a3738 Mon Sep 17 00:00:00 2001 From: Daniil Gusev <133032822+daniil-quix@users.noreply.github.com> Date: Tue, 7 May 2024 16:45:31 +0200 Subject: [PATCH 23/28] Update quixstreams/state/rocksdb/transaction.py Fix typo Co-authored-by: Tim Sawicki <136370015+tim-quix@users.noreply.github.com> --- quixstreams/state/rocksdb/transaction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py index 26c546a42..c2e2a3767 100644 --- a/quixstreams/state/rocksdb/transaction.py +++ b/quixstreams/state/rocksdb/transaction.py @@ -235,7 +235,7 @@ def exists(self, key: Any, prefix: bytes, cf_name: str = "default") -> bool: def prepare(self, processed_offset: int): """ Produce changelog messages to the changelog topic for all changes accumulated - in this transaction and prepare transcation to flush its state to the state + in this transaction and prepare transaction to flush its state to the state store. 
After successful `prepare()`, the transaction status is changed to PREPARED, From 8873f788fc19968f4891451450b65ae117b7fab8 Mon Sep 17 00:00:00 2001 From: Daniil Gusev <133032822+daniil-quix@users.noreply.github.com> Date: Tue, 7 May 2024 16:45:50 +0200 Subject: [PATCH 24/28] Update quixstreams/app.py Fix typo Co-authored-by: Tim Sawicki <136370015+tim-quix@users.noreply.github.com> --- quixstreams/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quixstreams/app.py b/quixstreams/app.py index 0dc825598..14b6fdb46 100644 --- a/quixstreams/app.py +++ b/quixstreams/app.py @@ -817,7 +817,7 @@ def _on_assign(self, _, topic_partitions: List[TopicPartition]): if self._state_manager.stores: logger.debug(f"Rebalancing: assigning state store partitions") for tp in topic_partitions: - # Get the latest committed offset for the assgined topic partition + # Get the latest committed offset for the assigned topic partition tp_committed = self._consumer.committed([tp], timeout=30)[0] # Assign store partitions store_partitions = self._state_manager.on_partition_assign( From 0b2b5eee47fe55d7bbd1db754581dba1d6d83ca4 Mon Sep 17 00:00:00 2001 From: Daniil Gusev <133032822+daniil-quix@users.noreply.github.com> Date: Tue, 7 May 2024 16:46:00 +0200 Subject: [PATCH 25/28] Update quixstreams/state/rocksdb/transaction.py Fix typo Co-authored-by: Tim Sawicki <136370015+tim-quix@users.noreply.github.com> --- quixstreams/state/rocksdb/transaction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py index c2e2a3767..03bdccac8 100644 --- a/quixstreams/state/rocksdb/transaction.py +++ b/quixstreams/state/rocksdb/transaction.py @@ -309,7 +309,7 @@ def prepared(self) -> bool: """ Check if the transaction is in PREPARED status. - Prepared transaction successefully flushed its changelog and cannot receive + Prepared transaction successfully flushed its changelog and cannot receive updates anymore, but its state is not yet flushed to the disk :return: `True` if transaction is prepared, `False` otherwise. From 2b225153f690822dc0f8b23318cd72197616b925 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 7 May 2024 16:52:21 +0200 Subject: [PATCH 26/28] Rename _should_skip_changelog -> _should_apply_changelog --- quixstreams/state/rocksdb/partition.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/quixstreams/state/rocksdb/partition.py b/quixstreams/state/rocksdb/partition.py index 27d1a4540..546500c78 100644 --- a/quixstreams/state/rocksdb/partition.py +++ b/quixstreams/state/rocksdb/partition.py @@ -99,7 +99,7 @@ def _changelog_recover_flush(self, changelog_offset: int, batch: WriteBatch): ) self.write(batch) - def _should_skip_changelog( + def _should_apply_changelog( self, headers: Dict[str, bytes], committed_offset: int ) -> bool: """ @@ -107,7 +107,7 @@ def _should_skip_changelog( :param headers: changelog message headers :param committed_offset: latest committed offset of the source topic partition - :return: True if update should be skipped, else False. + :return: True if update should be applied, else False. """ # Parse the processed topic-partition-offset info from the changelog message # headers to determine whether the update should be applied or skipped. @@ -122,8 +122,8 @@ def _should_skip_changelog( # This way it will recover to a consistent state if the checkpointing code # produced the changelog messages but failed to commit # the source topic offset. 
- return processed_offset >= committed_offset - return False + return processed_offset < committed_offset + return True def recover_from_changelog_message( self, changelog_message: ConfluentKafkaMessageProto, committed_offset: int @@ -155,7 +155,7 @@ def recover_from_changelog_message( batch = WriteBatch(raw_mode=True) # Determine whether the update should be applied or skipped based on the # latest committed offset and processed offset from the changelog message header - if not self._should_skip_changelog( + if self._should_apply_changelog( headers=headers, committed_offset=committed_offset ): key = changelog_message.key() From 3787271060f892c91ec0aef934bfd09c55790e92 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 7 May 2024 17:13:26 +0200 Subject: [PATCH 27/28] Remove source_topic_name from changelog classes --- quixstreams/state/manager.py | 1 - quixstreams/state/recovery.py | 12 +----------- quixstreams/state/rocksdb/transaction.py | 3 +-- .../test_recovery/test_changelog_producer.py | 4 ---- .../test_rocksdb/test_windowed/fixtures.py | 1 - 5 files changed, 2 insertions(+), 19 deletions(-) diff --git a/quixstreams/state/manager.py b/quixstreams/state/manager.py index 23770bc72..a2a1d51ec 100644 --- a/quixstreams/state/manager.py +++ b/quixstreams/state/manager.py @@ -128,7 +128,6 @@ def _setup_changelogs( ) return ChangelogProducerFactory( changelog_name=changelog_topic.name, - source_topic_name=topic_name, producer=self._producer, ) diff --git a/quixstreams/state/recovery.py b/quixstreams/state/recovery.py index 9e1899224..c26ef6236 100644 --- a/quixstreams/state/recovery.py +++ b/quixstreams/state/recovery.py @@ -134,9 +134,7 @@ class ChangelogProducerFactory: Generates ChangelogProducers, which produce changelog messages to a StorePartition. 
""" - def __init__( - self, changelog_name: str, source_topic_name: str, producer: RowProducer - ): + def __init__(self, changelog_name: str, producer: RowProducer): """ :param changelog_name: changelog topic name :param producer: a RowProducer (not shared with `Application` instance) @@ -144,7 +142,6 @@ def __init__( :return: a ChangelogWriter instance """ self._changelog_name = changelog_name - self._source_topic_name = source_topic_name self._producer = producer def get_partition_producer(self, partition_num) -> "ChangelogProducer": @@ -156,7 +153,6 @@ def get_partition_producer(self, partition_num) -> "ChangelogProducer": """ return ChangelogProducer( changelog_name=self._changelog_name, - source_topic_name=self._source_topic_name, partition=partition_num, producer=self._producer, ) @@ -171,7 +167,6 @@ class ChangelogProducer: def __init__( self, changelog_name: str, - source_topic_name: str, partition: int, producer: RowProducer, ): @@ -181,14 +176,9 @@ def __init__( :param producer: a RowProducer (not shared with `Application` instance) """ self._changelog_name = changelog_name - self._source_topic_name = source_topic_name self._partition = partition self._producer = producer - @property - def source_topic_name(self) -> str: - return self._source_topic_name - @property def changelog_name(self) -> str: return self._changelog_name diff --git a/quixstreams/state/rocksdb/transaction.py b/quixstreams/state/rocksdb/transaction.py index 03bdccac8..12c99753e 100644 --- a/quixstreams/state/rocksdb/transaction.py +++ b/quixstreams/state/rocksdb/transaction.py @@ -368,8 +368,7 @@ def _produce_changelog(self, processed_offset: Optional[int] = None): if changelog_producer is None: return - source_topic, changelog_topic, partition = ( - changelog_producer.source_topic_name, + changelog_topic, partition = ( changelog_producer.changelog_name, changelog_producer.partition, ) diff --git a/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py b/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py index 9263a3f9b..d4c58f3ef 100644 --- a/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py +++ b/tests/test_quixstreams/test_state/test_recovery/test_changelog_producer.py @@ -29,7 +29,6 @@ def test_produce( producer = ChangelogProducer( changelog_name=changelog.name, partition=p_num, - source_topic_name=source_topic_name, producer=row_producer_factory(), ) producer.produce( @@ -49,7 +48,6 @@ def test_produce( class TestChangelogProducerFactory: def test_get_partition_producer(self, row_producer_factory): changelog_name = "changelog__topic" - source_topic_name = "source-topic" producer = row_producer_factory() p_num = 1 @@ -57,8 +55,6 @@ def test_get_partition_producer(self, row_producer_factory): changelog_producer = ChangelogProducerFactory( changelog_name=changelog_name, producer=producer, - source_topic_name=source_topic_name, ).get_partition_producer(partition_num=p_num) assert changelog_producer.changelog_name == changelog_name assert changelog_producer.partition == p_num - assert changelog_producer.source_topic_name == source_topic_name diff --git a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py index b60884185..e147b5844 100644 --- a/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py +++ b/tests/test_quixstreams/test_state/test_rocksdb/test_windowed/fixtures.py @@ -64,7 +64,6 @@ def factory( base_dir=str(tmp_path), 
changelog_producer_factory=ChangelogProducerFactory( changelog_name=changelog or str(uuid.uuid4()), - source_topic_name=topic, producer=producer or create_autospec(RowProducer)("address"), ), ) From b64069956cdc4d1c7502af89e76e9e9d0f98485b Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 7 May 2024 17:23:16 +0200 Subject: [PATCH 28/28] Re-generate API docs --- docs/api-reference/application.md | 42 +- docs/api-reference/context.md | 6 +- docs/api-reference/dataframe.md | 44 +- docs/api-reference/kafka.md | 61 +- docs/api-reference/quixstreams.md | 7488 ++++++++++++++------------- docs/api-reference/serialization.md | 32 +- docs/api-reference/state.md | 14 +- docs/api-reference/topics.md | 44 +- 8 files changed, 4131 insertions(+), 3600 deletions(-) diff --git a/docs/api-reference/application.md b/docs/api-reference/application.md index f66e0c08e..187faaacc 100644 --- a/docs/api-reference/application.md +++ b/docs/api-reference/application.md @@ -10,7 +10,7 @@ class Application() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L55) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L59) The main Application class. @@ -62,7 +62,7 @@ def __init__(broker_address: Optional[str] = None, quix_sdk_token: Optional[str] = None, consumer_group: Optional[str] = None, auto_offset_reset: AutoOffsetReset = "latest", - auto_commit_enable: bool = True, + commit_interval: float = 5.0, partitioner: Partitioner = "murmur2", consumer_extra_config: Optional[dict] = None, producer_extra_config: Optional[dict] = None, @@ -81,7 +81,7 @@ def __init__(broker_address: Optional[str] = None, topic_manager: Optional[TopicManager] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L93) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L97)
@@ -102,9 +102,9 @@ Passed as `group.id` to `confluent_kafka.Consumer`. Linked Environment Variable: `Quix__Consumer__Group`. Default - "quixstreams-default" (set during init) >***NOTE:*** Quix Applications will prefix it with the Quix workspace id. +- `commit_interval`: How often to commit the processed messages in seconds. +Default - 5.0. - `auto_offset_reset`: Consumer `auto.offset.reset` setting -- `auto_commit_enable`: If true, periodically commit offset of -the last message handed to the application. Default - `True`. - `partitioner`: A function to be used to determine the outgoing message partition. - `consumer_extra_config`: A dictionary with additional options that @@ -157,7 +157,6 @@ instead of the default one. def Quix(cls, consumer_group: Optional[str] = None, auto_offset_reset: AutoOffsetReset = "latest", - auto_commit_enable: bool = True, partitioner: Partitioner = "murmur2", consumer_extra_config: Optional[dict] = None, producer_extra_config: Optional[dict] = None, @@ -176,7 +175,7 @@ def Quix(cls, topic_manager: Optional[QuixTopicManager] = None) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L296) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L303) >***NOTE:*** DEPRECATED: use Application with `quix_sdk_token` argument instead. @@ -224,8 +223,6 @@ Linked Environment Variable: `Quix__Consumer__Group`. Default - "quixstreams-default" (set during init). >***NOTE:*** Quix Applications will prefix it with the Quix workspace id. - `auto_offset_reset`: Consumer `auto.offset.reset` setting -- `auto_commit_enable`: If true, periodically commit offset of -the last message handed to the application. Default - `True`. - `partitioner`: A function to be used to determine the outgoing message partition. - `consumer_extra_config`: A dictionary with additional options that @@ -288,7 +285,7 @@ def topic(name: str, timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L436) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L439) Create a topic definition. @@ -369,7 +366,7 @@ topic = app.topic("input-topic", timestamp_extractor=custom_ts_extractor) def dataframe(topic: Topic) -> StreamingDataFrame ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L516) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L519) A simple helper method that generates a `StreamingDataFrame`, which is used @@ -416,10 +413,10 @@ to be used as an input topic. #### Application.stop ```python -def stop() +def stop(fail: bool = False) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L552) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L556) Stop the internal poll loop and the message processing. @@ -429,6 +426,13 @@ likely through some sort of threading). To otherwise stop an application, either send a `SIGTERM` to the process (like Kubernetes does) or perform a typical `KeyboardInterrupt` (`Ctrl+C`). + +
+***Arguments:***
+
+- `fail`: if True, signals that the application is stopped due
+to an unhandled exception, and it shouldn't commit the current checkpoint.
+
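A minimal usage sketch for the `stop()` behaviour described above, stopping a running application from another thread; the broker address, topic name, and 30-second timer are hypothetical:

```python
import threading

from quixstreams import Application

app = Application(broker_address="localhost:9092", consumer_group="example-group")
topic = app.topic("input-topic")
sdf = app.dataframe(topic)

# Stop the app after 30 seconds; app.run() below returns once stop() is called.
# Calling app.stop(fail=True) instead would skip committing the current checkpoint.
threading.Timer(30.0, app.stop).start()

app.run(sdf)
```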

@@ -439,7 +443,7 @@ To otherwise stop an application, either send a `SIGTERM` to the process def get_producer() -> Producer ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L566) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L579) Create and return a pre-configured Producer instance. The Producer is initialized with params passed to Application. @@ -471,10 +475,10 @@ with app.get_producer() as producer: #### Application.get\_consumer ```python -def get_consumer() -> Consumer +def get_consumer(auto_commit_enable: bool = True) -> Consumer ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L597) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L610) Create and return a pre-configured Consumer instance. The Consumer is initialized with params passed to Application. @@ -519,7 +523,7 @@ with app.get_consumer() as consumer: def clear_state() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L641) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L654) Clear the state of the application. @@ -533,11 +537,11 @@ Clear the state of the application. def run(dataframe: StreamingDataFrame) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L719) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L660) Start processing data from Kafka using provided `StreamingDataFrame` -One started, can be safely terminated with a `SIGTERM` signal +Once started, it can be safely terminated with a `SIGTERM` signal (like Kubernetes does) or a typical `KeyboardInterrupt` (`Ctrl+C`). diff --git a/docs/api-reference/context.md b/docs/api-reference/context.md index 77be373b9..bbb1a800e 100644 --- a/docs/api-reference/context.md +++ b/docs/api-reference/context.md @@ -12,7 +12,7 @@ def set_message_context(context: Optional[MessageContext]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L21) Set a MessageContext for the current message in the given `contextvars.Context` @@ -55,7 +55,7 @@ sdf = sdf.update(lambda value: alter_context(value)) def message_context() -> MessageContext ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L52) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L52) Get a MessageContext for the current message, which houses most of the message @@ -96,7 +96,7 @@ instance of `MessageContext` def message_key() -> Any ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L83) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L83) Get the current message's key. 
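As a small illustration of reading the current message's key during processing, a sketch assuming a hypothetical broker address and topic name:

```python
from quixstreams import Application
from quixstreams.context import message_key

app = Application(broker_address="localhost:9092")
sdf = app.dataframe(app.topic("input-topic"))

# Attach the current Kafka message key to every processed value
# (assumes the message values are dictionaries).
sdf = sdf.apply(lambda value: {**value, "source_key": message_key()})
```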
diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index 67e0c57f8..245c44b2f 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -10,7 +10,7 @@ class StreamingDataFrame(BaseStreaming) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L32) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L31) `StreamingDataFrame` is the main object you will use for ETL work. @@ -74,7 +74,7 @@ def apply(func: Union[DataFrameFunc, DataFrameStatefulFunc], expand: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L109) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L108) Apply a function to transform the value and return a new value. @@ -122,7 +122,7 @@ def update(func: Union[DataFrameFunc, DataFrameStatefulFunc], stateful: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L152) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L151) Apply a function to mutate value in-place or to perform a side effect @@ -170,7 +170,7 @@ def filter(func: Union[DataFrameFunc, DataFrameStatefulFunc], stateful: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L191) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L190) Filter value using provided function. @@ -218,7 +218,7 @@ of type `State` to perform stateful operations. def contains(key: str) -> StreamingSeries ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L244) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L233) Check if the key is present in the Row value. @@ -258,7 +258,7 @@ def to_topic(topic: Topic, key: Optional[Callable[[object], object]] = None) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L267) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L256) Produce current value to a topic. You can optionally specify a new key. @@ -306,7 +306,7 @@ By default, the current message key will be used. def compose() -> StreamCallable ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L306) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L295) Compose all functions of this StreamingDataFrame into one big closure. 
@@ -349,7 +349,7 @@ and returns a result of StreamingDataFrame def test(value: object, ctx: Optional[MessageContext] = None) -> Any ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L336) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L325) A shorthand to test `StreamingDataFrame` with provided value @@ -383,7 +383,7 @@ def tumbling_window(duration_ms: Union[int, timedelta], name: Optional[str] = None) -> TumblingWindowDefinition ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L354) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L343) Create a tumbling window transformation on this StreamingDataFrame. @@ -468,7 +468,7 @@ def hopping_window(duration_ms: Union[int, timedelta], name: Optional[str] = None) -> HoppingWindowDefinition ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L429) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L418) Create a hopping window transformation on this StreamingDataFrame. @@ -561,7 +561,7 @@ sdf = ( class StreamingSeries(BaseStreaming) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L17) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L40) `StreamingSeries` are typically generated by `StreamingDataframes` when getting elements from, or performing certain operations on, a `StreamingDataframe`, @@ -627,7 +627,7 @@ sdf = sdf[["column_a"] & (sdf["new_sum_field"] >= 10)] def from_func(cls, func: StreamCallable) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L77) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L100) Create a StreamingSeries from a function. @@ -655,7 +655,7 @@ instance of `StreamingSeries` def apply(func: StreamCallable) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L91) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L114) Add a callable to the execution list for this series. @@ -708,7 +708,7 @@ def compose(allow_filters: bool = True, allow_updates: bool = True) -> StreamCallable ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L125) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L148) Compose all functions of this StreamingSeries into one big closure. 
@@ -768,7 +768,7 @@ and returns a result of `StreamingSeries` def test(value: Any, ctx: Optional[MessageContext] = None) -> Any ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L172) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L195) A shorthand to test `StreamingSeries` with provided value @@ -800,7 +800,7 @@ result of `StreamingSeries` def isin(other: Container) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L208) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L231) Check if series value is in "other". @@ -845,7 +845,7 @@ new StreamingSeries def contains(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L235) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L258) Check if series value contains "other" @@ -890,7 +890,7 @@ new StreamingSeries def is_(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L260) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L283) Check if series value refers to the same object as `other` @@ -932,7 +932,7 @@ new StreamingSeries def isnot(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L283) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L306) Check if series value does not refer to the same object as `other` @@ -975,7 +975,7 @@ new StreamingSeries def isnull() -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L307) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L330) Check if series value is None. @@ -1012,7 +1012,7 @@ new StreamingSeries def notnull() -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L330) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L353) Check if series value is not None. @@ -1049,7 +1049,7 @@ new StreamingSeries def abs() -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L353) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L376) Get absolute value of the series value. 
diff --git a/docs/api-reference/kafka.md b/docs/api-reference/kafka.md index 986e1e6fe..5a8e93b16 100644 --- a/docs/api-reference/kafka.md +++ b/docs/api-reference/kafka.md @@ -10,7 +10,7 @@ class Producer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L37) @@ -24,7 +24,7 @@ def __init__(broker_address: str, extra_config: Optional[dict] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L55) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L38) A wrapper around `confluent_kafka.Producer`. @@ -61,14 +61,15 @@ def produce(topic: str, partition: Optional[int] = None, timestamp: Optional[int] = None, poll_timeout: float = 5.0, - buffer_error_max_tries: int = 3) + buffer_error_max_tries: int = 3, + on_delivery: Optional[DeliveryCallback] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L94) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L74) -Produce message to topic. +Produce a message to a topic. -It also polls Kafka for callbacks before producing in order to minimize +It also polls Kafka for callbacks before producing to minimize the probability of `BufferError`. If `BufferError` still happens, the method will poll Kafka with timeout to free up the buffer and try again. @@ -86,6 +87,8 @@ to free up the buffer and try again. - `poll_timeout`: timeout for `poll()` call in case of `BufferError` - `buffer_error_max_tries`: max retries for `BufferError`. Pass `0` to not retry after `BufferError`. +- `on_delivery`: the delivery callback to be triggered on `poll()` +for the produced message. @@ -97,7 +100,7 @@ Pass `0` to not retry after `BufferError`. def poll(timeout: float = 0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L152) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L135) Polls the producer for events and calls `on_delivery` callbacks. @@ -118,7 +121,7 @@ Polls the producer for events and calls `on_delivery` callbacks. def flush(timeout: Optional[float] = None) -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L160) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L143) Wait for all messages in the Producer queue to be delivered. 
@@ -147,7 +150,7 @@ number of messages remaining to flush class Consumer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L66) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L66) @@ -166,7 +169,7 @@ def __init__(broker_address: str, extra_config: Optional[dict] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L67) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L67) A wrapper around `confluent_kafka.Consumer`. @@ -208,7 +211,7 @@ Note: values passed as arguments override values in `extra_config`. def poll(timeout: Optional[float] = None) -> Optional[Message] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L126) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L126) Consumes a single message, calls callbacks and returns events. @@ -249,7 +252,7 @@ def subscribe(topics: List[str], on_lost: Optional[RebalancingCallback] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L144) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L144) Set subscription to supplied list of topics @@ -292,7 +295,7 @@ for example, may fail. def unsubscribe() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L238) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L238) Remove current subscription. @@ -312,7 +315,7 @@ def store_offsets(message: Optional[Message] = None, offsets: Optional[List[TopicPartition]] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L246) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L246) .. py:function:: store_offsets([message=None], [offsets=None]) @@ -347,7 +350,7 @@ def commit(message: Optional[Message] = None, asynchronous: bool = True) -> Optional[List[TopicPartition]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L280) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L280) Commit a message or a list of offsets. @@ -385,7 +388,7 @@ def committed(partitions: List[TopicPartition], timeout: Optional[float] = None) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L320) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L320) .. 
py:function:: committed(partitions, [timeout=None]) @@ -422,7 +425,7 @@ def get_watermark_offsets(partition: TopicPartition, cached: bool = False) -> Tuple[int, int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L340) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L340) Retrieve low and high offsets for the specified partition. @@ -461,7 +464,7 @@ def list_topics(topic: Optional[str] = None, timeout: Optional[float] = None) -> ClusterMetadata ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L366) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L366) .. py:function:: list_topics([topic=None], [timeout=-1]) @@ -494,7 +497,7 @@ None or -1 is infinite. Default: None def memberid() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L389) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L389) Return this client's broker-assigned group member id. @@ -517,7 +520,7 @@ def offsets_for_times(partitions: List[TopicPartition], timeout: Optional[float] = None) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L402) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L402) Look up offsets by timestamp for the specified partitions. @@ -546,7 +549,7 @@ last message in the partition, a value of -1 will be returned. def pause(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L428) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L428) Pause consumption for the provided list of partitions. @@ -574,7 +577,7 @@ Does NOT affect the result of Consumer.assignment(). def resume(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L442) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L442) .. py:function:: resume(partitions) @@ -600,7 +603,7 @@ Resume consumption for the provided list of partitions. def position(partitions: List[TopicPartition]) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L454) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L454) Retrieve current positions (offsets) for the specified partitions. @@ -633,7 +636,7 @@ the last consumed message + 1. 
def seek(partition: TopicPartition) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L468) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L468) Set consume position for partition to offset. @@ -665,7 +668,7 @@ pass the offset in an `assign()` call. def assignment() -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L485) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L485) Returns the current partition assignment. @@ -690,7 +693,7 @@ Returns the current partition assignment. def set_sasl_credentials(username: str, password: str) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L498) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L498) Sets the SASL credentials used for this client. These credentials will overwrite the old ones, and will be used the next @@ -709,7 +712,7 @@ This method is applicable only to SASL PLAIN and SCRAM mechanisms. def incremental_assign(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L510) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L510) Assign new partitions. @@ -729,7 +732,7 @@ Any additional partitions besides the ones passed during the `Consumer` def incremental_unassign(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L522) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L522) Revoke partitions. @@ -745,7 +748,7 @@ Can be called outside an on_revoke callback. def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L530) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L530) Close down and terminate the Kafka Consumer. diff --git a/docs/api-reference/quixstreams.md b/docs/api-reference/quixstreams.md index c4205ca2e..d9bb4c7a2 100644 --- a/docs/api-reference/quixstreams.md +++ b/docs/api-reference/quixstreams.md @@ -2,966 +2,894 @@ ## quixstreams - - -## quixstreams.core - - - -## quixstreams.core.stream - - + -## quixstreams.core.stream.stream +## quixstreams.logging - + -### Stream +#### configure\_logging ```python -class Stream() +def configure_logging(loglevel: Optional[LogLevel]) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L22) - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/logging.py#L24) -#### Stream.\_\_init\_\_ +Configure "quixstreams" logger. 
-```python -def __init__(func: Optional[StreamFunction] = None, - parent: Optional[Self] = None) -``` +>***NOTE:*** If "quixstreams" logger already has pre-defined handlers +(e.g. logging has already been configured via `logging`, or the function +is called twice), it will skip configuration and return `False`. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L23) +**Arguments**: -A base class for all streaming operations. +- `loglevel`: a valid log level as a string or None. +If None passed, this function is no-op and no logging will be configured. -`Stream` is an abstraction of a function pipeline. -Each Stream has a function and a parent (None by default). -When adding new function to the stream, it creates a new `Stream` object and -sets "parent" to the previous `Stream` to maintain an order of execution. +**Returns**: -Streams supports 3 types of functions: -- "Apply" - generate new values based on a previous one. - The result of an Apply function is passed downstream to the next functions. - If "expand=True" is passed and the function returns an `Iterable`, - each item of it will be treated as a separate value downstream. -- "Update" - update values in-place. - The result of an Update function is always ignored, and its input is passed - downstream. -- "Filter" - to filter values from the Stream. - The result of a Filter function is interpreted as boolean. - If it's `True`, the input will be passed downstream. - If it's `False`, the `Filtered` exception will be raised to signal that the - value is filtered out. +True if logging config has been updated, otherwise False. -To execute the functions on the `Stream`, call `.compose()` method, and -it will return a closure to execute all the functions accumulated in the Stream -and its parents. + -**Arguments**: +## quixstreams.error\_callbacks -- `func`: a function to be called on the stream. -It is expected to be wrapped into one of "Apply", "Filter" or "Update" from -`quixstreams.core.stream.functions` package. -Default - "Apply(lambda v: v)". -- `parent`: a parent `Stream` + - +## quixstreams.platforms -#### Stream.add\_filter + -```python -def add_filter(func: Callable[[T], R]) -> Self -``` +## quixstreams.platforms.quix.config -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L79) + -Add a function to filter values from the Stream. +### TopicCreationConfigs -The return value of the function will be interpreted as `bool`. -If the function returns `False`-like result, the Stream will raise `Filtered` -exception during execution. +```python +@dataclasses.dataclass +class TopicCreationConfigs() +``` -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L59) -- `func`: a function to filter values from the stream + -**Returns**: +#### name -a new `Stream` derived from the current one +Required when not created by a Quix App. 
- + -#### Stream.add\_apply +#### strip\_workspace\_id\_prefix ```python -def add_apply(func: Callable[[T], R], expand: bool = False) -> Self +def strip_workspace_id_prefix(workspace_id: str, s: str) -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L92) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L68) -Add an "apply" function to the Stream. +Remove the workspace ID from a given string if it starts with it, -The function is supposed to return a new value, which will be passed -further during execution. +typically a topic or consumer group id **Arguments**: -- `func`: a function to generate a new value -- `expand`: if True, expand the returned iterable into individual values -downstream. If returned value is not iterable, `TypeError` will be raised. -Default - `False`. +- `workspace_id`: the workspace id +- `s`: the string to append to **Returns**: -a new `Stream` derived from the current one +the string with workspace_id prefix removed - + -#### Stream.add\_update +#### prepend\_workspace\_id ```python -def add_update(func: Callable[[T], object]) -> Self +def prepend_workspace_id(workspace_id: str, s: str) -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L109) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L80) -Add an "update" function to the Stream, that will mutate the input value. +Add the workspace ID as a prefix to a given string if it does not have it, -The return of this function will be ignored and its input -will be passed downstream. +typically a topic or consumer group it **Arguments**: -- `func`: a function to mutate the value +- `workspace_id`: the workspace id +- `s`: the string to append to **Returns**: -a new Stream derived from the current one +the string with workspace_id prepended - + -#### Stream.diff +### QuixKafkaConfigsBuilder ```python -def diff(other: "Stream") -> Self +class QuixKafkaConfigsBuilder() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L121) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L92) -Takes the difference between Streams `self` and `other` based on their last +Retrieves all the necessary information from the Quix API and builds all the +objects required to connect a confluent-kafka client to the Quix Platform. -common parent, and returns a new `Stream` that includes only this difference. +If not executed within the Quix platform directly, you must provide a Quix +"streaming" (aka "sdk") token, or Personal Access Token. -It's impossible to calculate a diff when: - - Streams don't have a common parent. - - When the `self` Stream already includes all the nodes from - the `other` Stream, and the resulting diff is empty. +Ideally you also know your workspace name or id. If not, you can search for it +using a known topic name, but note the search space is limited to the access level +of your token. -**Arguments**: +It also currently handles the app_auto_create_topics setting for Application.Quix. -- `other`: a `Stream` to take a diff from. 
+ -**Raises**: +#### QuixKafkaConfigsBuilder.\_\_init\_\_ -- `ValueError`: if Streams don't have a common parent -or if the diff is empty. +```python +def __init__(quix_sdk_token: Optional[str] = None, + workspace_id: Optional[str] = None, + workspace_cert_path: Optional[str] = None, + quix_portal_api_service: Optional[QuixPortalApiService] = None) +``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L108) -new `Stream` instance including all the Streams from the diff +**Arguments**: - +- `quix_portal_api_service`: A QuixPortalApiService instance (else generated) +- `workspace_id`: A valid Quix Workspace ID (else searched for) +- `workspace_cert_path`: path to an existing workspace cert (else retrieved) -#### Stream.tree + + +#### QuixKafkaConfigsBuilder.strip\_workspace\_id\_prefix ```python -def tree() -> List[Self] +def strip_workspace_id_prefix(s: str) -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L150) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L184) -Return a list of all parent Streams including the node itself. +Remove the workspace ID from a given string if it starts with it, -The tree is ordered from child to parent (current node comes first). +typically a topic or consumer group id + +**Arguments**: + +- `s`: the string to append to **Returns**: -a list of `Stream` objects +the string with workspace_id prefix removed - + -#### Stream.compose +#### QuixKafkaConfigsBuilder.prepend\_workspace\_id ```python -def compose(allow_filters: bool = True, - allow_updates: bool = True, - allow_expands: bool = True) -> Callable[[T], R] +def prepend_workspace_id(s: str) -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/stream.py#L164) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L194) -Compose a list of functions from this `Stream` and its parents into one - -big closure using a "composer" function. +Add the workspace ID as a prefix to a given string if it does not have it, -Closures are more performant than calling all the functions in the -`Stream.tree()` one-by-one. +typically a topic or consumer group it **Arguments**: -- `allow_filters`: If False, this function will fail with `ValueError` if -the stream has filter functions in the tree. Default - True. -- `allow_updates`: If False, this function will fail with `ValueError` if -the stream has update functions in the tree. Default - True. -- `allow_expands`: If False, this function will fail with `ValueError` if -the stream has functions with "expand=True" in the tree. Default - True. - -**Raises**: - -- `ValueError`: if disallowed functions are present in the stream tree. 
+- `s`: the string to append to - +**Returns**: -## quixstreams.core.stream.functions +the string with workspace_id prepended - + -### StreamFunction +#### QuixKafkaConfigsBuilder.search\_for\_workspace ```python -class StreamFunction(abc.ABC) +def search_for_workspace( + workspace_name_or_id: Optional[str] = None) -> Optional[dict] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L26) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L204) -A base class for all the streaming operations in Quix Streams. +Search for a workspace given an expected workspace name or id. -It provides two methods that return closures to be called on the input values: -- `get_executor` - a wrapper to execute on a single value -- `get_executor_expanded` - a wrapper to execute on an expanded value. - Expanded value is a list, where each item should be treated as a separate value. +**Arguments**: - +- `workspace_name_or_id`: the expected name or id of a workspace -#### StreamFunction.func +**Returns**: + +the workspace data dict if search success, else None + + + +#### QuixKafkaConfigsBuilder.get\_workspace\_info ```python -@property -def func() -> StreamCallable +def get_workspace_info(known_workspace_topic: Optional[str] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L43) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L227) -The original function +Queries for workspace data from the Quix API, regardless of instance cache, - +and updates instance attributes from query result. -#### StreamFunction.get\_executor +**Arguments**: + +- `known_workspace_topic`: a topic you know to exist in some workspace + + + +#### QuixKafkaConfigsBuilder.search\_workspace\_for\_topic ```python -@abc.abstractmethod -def get_executor() -> StreamCallable +def search_workspace_for_topic(workspace_id: str, topic: str) -> Optional[str] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L50) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L254) -Returns a wrapper to be called on a single value. +Search through all the topics in the given workspace id to see if there is a - +match with the provided topic. -#### StreamFunction.get\_executor\_expanded +**Arguments**: -```python -@abc.abstractmethod -def get_executor_expanded() -> StreamCallable -``` +- `workspace_id`: the workspace to search in +- `topic`: the topic to search for -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L56) +**Returns**: -Returns a wrapper to be called on a list of expanded values. 
+the workspace_id if success, else None - + -### ApplyFunction +#### QuixKafkaConfigsBuilder.search\_for\_topic\_workspace ```python -class ApplyFunction(StreamFunction) +def search_for_topic_workspace(topic: str) -> Optional[dict] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L62) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L270) -Wrap a function into "Apply" function. +Find what workspace a topic belongs to. -The provided function is expected to return a new value based on input, -and its result will always be passed downstream. +If there is only one workspace altogether, it is assumed to be the workspace. +More than one means each workspace will be searched until the first hit. - +**Arguments**: -### ApplyExpandFunction +- `topic`: the topic to search for + +**Returns**: + +workspace data dict if topic search success, else None + + + +#### QuixKafkaConfigsBuilder.get\_workspace\_ssl\_cert ```python -class ApplyExpandFunction(StreamFunction) +def get_workspace_ssl_cert( + extract_to_folder: Optional[Path] = None) -> Optional[str] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L85) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L291) -Wrap a function into "Apply" function and expand the returned iterable -into separate values downstream. +Gets and extracts zipped certificate from the API to provided folder if the -The provided function is expected to return an `Iterable`. -If the returned value is not `Iterable`, `TypeError` will be raised. +SSL certificate is specified in broker configuration. - +If no path was provided, will dump to /tmp. Expects cert named 'ca.cert'. -### FilterFunction +**Arguments**: -```python -class FilterFunction(StreamFunction) -``` +- `extract_to_folder`: path to folder to dump zipped cert file to -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L114) +**Returns**: -Wraps a function into a "Filter" function. -The result of a Filter function is interpreted as boolean. -If it's `True`, the input will be return downstream. -If it's `False`, the `Filtered` exception will be raised to signal that the -value is filtered out. +full cert filepath as string or `None` if certificate is not specified - + -### UpdateFunction +#### QuixKafkaConfigsBuilder.create\_topics ```python -class UpdateFunction(StreamFunction) +def create_topics(topics: List[Topic], + finalize_timeout_seconds: Optional[int] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L146) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L375) -Wrap a function into an "Update" function. +Create topics in a Quix cluster. -The provided function is expected to mutate the value -or to perform some side effect. -Its result will always be ignored, and its input is passed -downstream. +**Arguments**: - +- `topics`: a list of `Topic` objects +- `finalize_timeout_seconds`: How long to wait for the topics to be +marked as "Ready" (and thus ready to produce to/consume from). 
-#### compose + + +#### QuixKafkaConfigsBuilder.get\_topic ```python -def compose(functions: List[StreamFunction], - allow_filters: bool = True, - allow_updates: bool = True, - allow_expands: bool = True) -> StreamCallable +def get_topic(topic_name: str) -> Optional[dict] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L175) - -Composes a list of functions and its parents into a single +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L421) -big closure like this: -``` -[func, func, func] -> func(func(func())) -``` +return the topic ID (the actual cluster topic name) if it exists, else None -Closures are more performant than calling all functions one by one in a loop. +>***NOTE***: if the name registered in Quix is instead the workspace-prefixed +version, this returns None unless that exact name was created WITHOUT the +Quix API. **Arguments**: -- `functions`: list of `StreamFunction` objects to compose -- `allow_filters`: If False, will fail with `ValueError` if -the list has `FilterFunction`. Default - True. -- `allow_updates`: If False, will fail with `ValueError` if -the list has `UpdateFunction`. Default - True. -- `allow_expands`: If False, will fail with `ValueError` if -the list has `ApplyFunction` with "expand=True". Default - True. +- `topic_name`: name of the topic -**Raises**: +**Returns**: -- `ValueError`: if disallowed functions are present in the list of functions. +response dict of the topic info if topic found, else None - + -#### composer +#### QuixKafkaConfigsBuilder.confirm\_topics\_exist ```python -def composer(outer_func: StreamCallable, - inner_func: StreamCallable) -> Callable[[T], R] +def confirm_topics_exist(topics: Union[List[Topic], List[str]]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/core/stream/functions.py#L227) - -A function that wraps two other functions into a closure. - -It passes the result of the inner function as an input to the outer function. - -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L443) -a function with one argument (value) +Confirm whether the desired set of topics exists in the Quix workspace. - +**Arguments**: -## quixstreams.dataframe.utils +- `topics`: a list of `Topic` or topic names - + -#### ensure\_milliseconds +#### QuixKafkaConfigsBuilder.get\_confluent\_broker\_config ```python -def ensure_milliseconds(delta: Union[int, timedelta]) -> int +def get_confluent_broker_config(known_topic: Optional[str] = None) -> dict ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/utils.py#L5) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L483) -Convert timedelta to milliseconds. +Get the full client config dictionary required to authenticate a confluent-kafka -If the `delta` is not -This function will also round the value to the closest milliseconds in case of -higher precision. +client to a Quix platform broker/workspace. + +The returned config can be used directly by any confluent-kafka-python consumer/ +producer (add your producer/consumer-specific configs afterward). 
**Arguments**: -- `delta`: `timedelta` object +- `known_topic`: a topic known to exist in some workspace **Returns**: -timedelta value in milliseconds as `int` - - +a dict of confluent-kafka-python client settings (see librdkafka +config for more details) -## quixstreams.dataframe.windows + - +#### QuixKafkaConfigsBuilder.get\_confluent\_client\_configs -## quixstreams.dataframe.windows.base +```python +def get_confluent_client_configs( + topics: list, + consumer_group_id: Optional[str] = None +) -> Tuple[dict, List[str], Optional[str]] +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/config.py#L528) -#### get\_window\_ranges +Get all the values you need in order to use a confluent_kafka-based client -```python -def get_window_ranges(timestamp_ms: int, - duration_ms: int, - step_ms: Optional[int] = None) -> List[Tuple[int, int]] -``` +with a topic on a Quix platform broker/workspace. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/base.py#L22) +The returned config can be used directly by any confluent-kafka-python consumer/ +producer (add your producer/consumer-specific configs afterward). -Get a list of window ranges for the given timestamp. +The topics and consumer group are appended with any necessary values. **Arguments**: -- `timestamp_ms`: timestamp in milliseconds -- `duration_ms`: window duration in milliseconds -- `step_ms`: window step in milliseconds for hopping windows, optional. +- `topics`: list of topics +- `consumer_group_id`: consumer group id, if needed **Returns**: -a list of (, ) tuples +a tuple with configs and altered versions of the topics +and consumer group name - + -## quixstreams.dataframe.windows.time\_based +## quixstreams.platforms.quix.env - + -### FixedTimeWindow +### QuixEnvironment ```python -class FixedTimeWindow() +class QuixEnvironment() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/time_based.py#L32) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L7) - +Class to access various Quix platform environment settings -#### FixedTimeWindow.final + + +#### QuixEnvironment.state\_management\_enabled ```python -def final(expand: bool = True) -> "StreamingDataFrame" +@property +def state_management_enabled() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/time_based.py#L95) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L19) -Apply the window aggregation and return results only when the windows are +Check whether "State management" is enabled for the current deployment -closed. +**Returns**: + +True if state management is enabled, otherwise False + + + +#### QuixEnvironment.deployment\_id -The format of returned windows: ```python -{ - "start": , - "end": , - "value: , -} +@property +def deployment_id() -> Optional[str] ``` -The individual window is closed when the event time -(the maximum observed timestamp across the partition) passes -its end timestamp + grace period. -The closed windows cannot receive updates anymore and are considered final. 
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L27) ->***NOTE:*** Windows can be closed only within the same message key. -If some message keys appear irregularly in the stream, the latest windows -can remain unprocessed until the message the same key is received. +Return current Quix deployment id. -**Arguments**: +This variable is meant to be set only by Quix Platform and only +when the application is deployed. -- `expand`: if `True`, each window result will be sent downstream as -an individual item. Otherwise, the list of window results will be sent. -Default - `True` +**Returns**: - +deployment id or None -#### FixedTimeWindow.current + + +#### QuixEnvironment.workspace\_id ```python -def current(expand: bool = True) -> "StreamingDataFrame" +@property +def workspace_id() -> Optional[str] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/time_based.py#L132) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L39) -Apply the window transformation to the StreamingDataFrame to return results +Return Quix workspace id if set -for each updated window. +**Returns**: + +workspace id or None + + + +#### QuixEnvironment.portal\_api -The format of returned windows: ```python -{ - "start": , - "end": , - "value: , -} +@property +def portal_api() -> Optional[str] ``` -This method processes streaming data and returns results as they come, -regardless of whether the window is closed or not. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L47) -**Arguments**: +Return Quix Portal API url if set -- `expand`: if `True`, each window result will be sent downstream as -an individual item. Otherwise, the list of window results will be sent. -Default - `True` - - +**Returns**: -## quixstreams.dataframe.windows.definitions +portal API URL or None - + -### FixedTimeWindowDefinition +#### QuixEnvironment.state\_dir ```python -class FixedTimeWindowDefinition(abc.ABC) +@property +def state_dir() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L20) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/env.py#L56) - +Return application state directory on Quix. -#### FixedTimeWindowDefinition.sum +**Returns**: -```python -def sum() -> "FixedTimeWindow" -``` +path to state dir -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L67) + -Configure the window to aggregate data by summing up values within +## quixstreams.platforms.quix.checks -each window period. + -**Returns**: +#### check\_state\_management\_enabled -an instance of `FixedTimeWindow` configured to perform sum aggregation. +```python +def check_state_management_enabled() +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/checks.py#L11) -#### FixedTimeWindowDefinition.count +Check if State Management feature is enabled for the current deployment on +Quix platform. +If it's disabled, the exception will be raised. 
+ + + +#### check\_state\_dir ```python -def count() -> "FixedTimeWindow" +def check_state_dir(state_dir: str) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L94) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/checks.py#L28) -Configure the window to aggregate data by counting the number of values +Check if Application "state_dir" matches the state dir on Quix platform. -within each window period. +If it doesn't match, the warning will be logged. -**Returns**: +**Arguments**: -an instance of `FixedTimeWindow` configured to perform record count. +- `state_dir`: application state_dir path - + -#### FixedTimeWindowDefinition.mean +## quixstreams.platforms.quix + + + +## quixstreams.platforms.quix.api + + + +### QuixPortalApiService ```python -def mean() -> "FixedTimeWindow" +class QuixPortalApiService() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L121) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/api.py#L19) -Configure the window to aggregate data by calculating the mean of the values +A light wrapper around the Quix Portal Api. If used in the Quix Platform, it will +use that workspaces auth token and portal endpoint, else you must provide it. -within each window period. +Function names closely reflect the respective API endpoint, +each starting with the method [GET, POST, etc.] followed by the endpoint path. -**Returns**: +Results will be returned in the form of request's Response.json(), unless something +else is required. Non-200's will raise exceptions. -an instance of `FixedTimeWindow` configured to calculate the mean -of the values. +See the swagger documentation for more info about the endpoints. - + -#### FixedTimeWindowDefinition.reduce +#### QuixPortalApiService.get\_workspace\_certificate ```python -def reduce(reducer: Callable[[Any, Any], Any], - initializer: Callable[[Any], Any]) -> "FixedTimeWindow" +def get_workspace_certificate( + workspace_id: Optional[str] = None) -> Optional[bytes] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L152) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/api.py#L112) -Configure the window to perform a custom aggregation using `reducer` +Get a workspace TLS certificate if available. -and `initializer` functions. +Returns `None` if certificate is not specified. -Example Snippet: -```python -sdf = StreamingDataFrame(...) 
+**Arguments**: -# Using "reduce()" to calculate multiple aggregates at once -def reducer(agg: dict, current: int): - aggregated = { - 'min': min(agg['min'], current), - 'max': max(agg['max'], current) - 'count': agg['count'] + 1 - } - return aggregated +- `workspace_id`: workspace id, optional -def initializer(current) -> dict: - return {'min': current, 'max': current, 'count': 1} +**Returns**: -window = ( - sdf.tumbling_window(duration_ms=1000) - .reduce(reducer=reducer, initializer=initializer) - .final() -) -``` +certificate as bytes if present, or None -**Arguments**: + -- `reducer`: A function that takes two arguments -(the accumulated value and a new value) and returns a single value. -The returned value will be saved to the state store and sent downstream. -- `initializer`: A function to call for every first element of the window. -This function is used to initialize the aggregation within a window. +## quixstreams.platforms.quix.exceptions -**Returns**: + -A window configured to perform custom reduce aggregation on the data. +## quixstreams.platforms.quix.topic\_manager - + -#### FixedTimeWindowDefinition.max +### QuixTopicManager ```python -def max() -> "FixedTimeWindow" +class QuixTopicManager(TopicManager) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L212) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/topic_manager.py#L9) -Configure a window to aggregate the maximum value within each window period. +The source of all topic management with quixstreams. -**Returns**: +This is specifically for Applications using the Quix platform. -an instance of `FixedTimeWindow` configured to calculate the maximum -value within each window period. +Generally initialized and managed automatically by an `Application.Quix`, +but allows a user to work with it directly when needed, such as using it alongside +a plain `Producer` to create its topics. - +See methods for details. -#### FixedTimeWindowDefinition.min + + +#### QuixTopicManager.\_\_init\_\_ ```python -def min() -> "FixedTimeWindow" +def __init__(topic_admin: TopicAdmin, + quix_config_builder: QuixKafkaConfigsBuilder, + create_timeout: int = 60) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/windows/definitions.py#L241) - -Configure a window to aggregate the minimum value within each window period. - -**Returns**: - -an instance of `FixedTimeWindow` configured to calculate the maximum -value within each window period. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/platforms/quix/topic_manager.py#L30) - +**Arguments**: -## quixstreams.dataframe +- `topic_admin`: an `Admin` instance +- `create_timeout`: timeout for topic creation +- `quix_config_builder`: A QuixKafkaConfigsBuilder instance, else one is +generated for you. 
- + -## quixstreams.dataframe.series +## quixstreams.dataframe.dataframe - + -### StreamingSeries +### StreamingDataFrame ```python -class StreamingSeries(BaseStreaming) +class StreamingDataFrame(BaseStreaming) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L17) - -`StreamingSeries` are typically generated by `StreamingDataframes` when getting -elements from, or performing certain operations on, a `StreamingDataframe`, -thus acting as a representation of "column" value. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L31) -They share some operations with the `StreamingDataframe`, but also provide some -additional functionality. +`StreamingDataFrame` is the main object you will use for ETL work. -Most column value operations are handled by this class, and `StreamingSeries` can -generate other `StreamingSeries` as a result of said operations. +Typically created with an `app = quixstreams.app.Application()` instance, +via `sdf = app.dataframe()`. What it Does: -- Allows ways to do simple operations with dataframe "column"/dictionary values: - - Basic ops like add, subtract, modulo, etc. -- Enables comparisons/inequalities: - - Greater than, equals, etc. - - and/or, is/not operations -- Can check for existence of columns in `StreamingDataFrames` -- Enables chaining of various operations together +- Builds a data processing pipeline, declaratively (not executed immediately) + - Executes this pipeline on inputs at runtime (Kafka message values) +- Provides functions/interface similar to Pandas Dataframes/Series +- Enables stateful processing (and manages everything related to it) How to Use: -For the most part, you may not even notice this class exists! -They will naturally be created as a result of typical `StreamingDataFrame` use. +Define various operations while continuously reassigning to itself (or new fields). -Auto-complete should help you with valid methods and type-checking should alert -you to invalid operations between `StreamingSeries`. +These operations will generally transform your data, access/update state, or produce +to kafka topics. -In general, any typical Pands dataframe operation between columns should be valid -with `StreamingSeries`, and you shouldn't have to think about them explicitly. +We recommend your data structure to be "columnar" (aka a dict/JSON) in nature so +that it works with the entire interface, but simple types like `ints`, `str`, etc. +are also supported. + +See the various methods and classes for more specifics, or for a deep dive into +usage, see `streamingdataframe.md` under the `docs/` folder. + +>***NOTE:*** column referencing like `sdf["a_column"]` and various methods often + create other object types (typically `quixstreams.dataframe.StreamingSeries`), + which is expected; type hinting should alert you to any issues should you + attempt invalid operations with said objects (however, we cannot infer whether + an operation is valid with respect to your data!). Example Snippet: ```python -# Random methods for example purposes. More detailed explanations found under -# various methods or in the docs folder. 
- sdf = StreamingDataframe() -sdf = sdf["column_a"].apply(a_func).apply(diff_func, stateful=True) -sdf["my_new_bool_field"] = sdf["column_b"].contains("this_string") -sdf["new_sum_field"] = sdf["column_c"] + sdf["column_d"] + 2 -sdf = sdf[["column_a"] & (sdf["new_sum_field"] >= 10)] +sdf = sdf.apply(a_func) +sdf = sdf.filter(another_func) +sdf = sdf.to_topic(topic_obj) ``` - + -#### StreamingSeries.from\_func +#### StreamingDataFrame.apply ```python -@classmethod -def from_func(cls, func: StreamCallable) -> Self +def apply(func: Union[DataFrameFunc, DataFrameStatefulFunc], + stateful: bool = False, + expand: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L77) - -Create a StreamingSeries from a function. - -The provided function will be wrapped into `Apply` - -**Arguments**: - -- `func`: a function to apply +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L108) -**Returns**: +Apply a function to transform the value and return a new value. -instance of `StreamingSeries` +The result will be passed downstream as an input value. - -#### StreamingSeries.apply +Example Snippet: ```python -def apply(func: StreamCallable) -> Self -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L91) - -Add a callable to the execution list for this series. - -The provided callable should accept a single argument, which will be its input. -The provided callable should similarly return one output, or None - -They can be chained together or included with other operations. - - -Example Snippet: - -```python -# The `StreamingSeries` are generated when `sdf["COLUMN_NAME"]` is called. -# This stores a string in state and capitalizes the column value; the result is -# assigned to a new column. -# Another apply converts a str column to an int, assigning it to a new column. - -def func(value: str, state: State): +# This stores a string in state and capitalizes every column with a string value. +# A second apply then keeps only the string value columns (shows non-stateful). +def func(d: dict, state: State): + value = d["store_field"] if value != state.get("my_store_key"): state.set("my_store_key") = value - return v.upper() + return {k: v.upper() if isinstance(v, str) else v for k, v in d.items()} sdf = StreamingDataframe() -sdf["new_col"] = sdf["a_column"]["nested_dict_key"].apply(func, stateful=True) -sdf["new_col_2"] = sdf["str_col"].apply(lambda v: int(v)) + sdf["str_col2"] + 2 +sdf = sdf.apply(func, stateful=True) +sdf = sdf.apply(lambda d: {k: v for k,v in d.items() if isinstance(v, str)}) + ``` **Arguments**: -- `func`: a callable with one argument and one output - -**Returns**: - -a new `StreamingSeries` with the new callable added +- `func`: a function to apply +- `stateful`: if `True`, the function will be provided with a second argument +of type `State` to perform stateful operations. +- `expand`: if True, expand the returned iterable into individual values +downstream. If returned value is not iterable, `TypeError` will be raised. +Default - `False`. 
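To complement the `apply` documentation above, a short sketch of the `expand=True` flag (the `"events"` column is hypothetical):

```python
# With expand=True, the iterable returned by the function is unpacked and
# each element is sent downstream as its own value.
sdf = sdf.apply(lambda value: value["events"], expand=True)
```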
- + -#### StreamingSeries.compose +#### StreamingDataFrame.update ```python -def compose(allow_filters: bool = True, - allow_updates: bool = True) -> StreamCallable +def update(func: Union[DataFrameFunc, DataFrameStatefulFunc], + stateful: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L125) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L151) -Compose all functions of this StreamingSeries into one big closure. +Apply a function to mutate value in-place or to perform a side effect -Closures are more performant than calling all the functions in the -`StreamingDataFrame` one-by-one. +that doesn't update the value (e.g. print a value to the console). -Generally not required by users; the `quixstreams.app.Application` class will -do this automatically. +The result of the function will be ignored, and the original value will be +passed downstream. Example Snippet: ```python -from quixstreams import Application - -app = Application(...) +# Stores a value and mutates a list by appending a new item to it. +# Also prints to console. -sdf = app.dataframe() -sdf = sdf["column_a"].apply(apply_func) -sdf = sdf["column_b"].contains(filter_func) -sdf = sdf.compose() +def func(values: list, state: State): + value = values[0] + if value != state.get("my_store_key"): + state.set("my_store_key") = value + values.append("new_item") -result_0 = sdf({"my": "record"}) -result_1 = sdf({"other": "record"}) +sdf = StreamingDataframe() +sdf = sdf.update(func, stateful=True) +sdf = sdf.update(lambda value: print("Received value: ", value)) ``` **Arguments**: -- `allow_filters`: If False, this function will fail with ValueError if -the stream has filter functions in the tree. Default - True. -- `allow_updates`: If False, this function will fail with ValueError if -the stream has update functions in the tree. Default - True. - -**Raises**: - -- `ValueError`: if disallowed functions are present in the tree of -underlying `Stream`. - -**Returns**: - -a function that accepts "value" -and returns a result of `StreamingSeries` +- `func`: function to update value +- `stateful`: if `True`, the function will be provided with a second argument +of type `State` to perform stateful operations. - + -#### StreamingSeries.test +#### StreamingDataFrame.filter ```python -def test(value: Any, ctx: Optional[MessageContext] = None) -> Any +def filter(func: Union[DataFrameFunc, DataFrameStatefulFunc], + stateful: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L172) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L190) -A shorthand to test `StreamingSeries` with provided value +Filter value using provided function. -and `MessageContext`. +If the function returns True-like value, the original value will be +passed downstream. +Otherwise, the `Filtered` exception will be raised (further processing for that +message will be skipped). -**Arguments**: -- `value`: value to pass through `StreamingSeries` -- `ctx`: instance of `MessageContext`, optional. -Provide it if the StreamingSeries instance has -functions calling `get_current_key()`. -Default - `None`. 
+Example Snippet: -**Returns**: +```python +# Stores a value and allows further processing only if the value is greater than +# what was previously stored. -result of `StreamingSeries` +def func(d: dict, state: State): + value = d["my_value"] + if value > state.get("my_store_key"): + state.set("my_store_key") = value + return True + return False - +sdf = StreamingDataframe() +sdf = sdf.filter(func, stateful=True) +``` -#### StreamingSeries.isin +**Arguments**: -```python -def isin(other: Container) -> Self -``` +- `func`: function to filter value +- `stateful`: if `True`, the function will be provided with second argument +of type `State` to perform stateful operations. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L208) + -Check if series value is in "other". +#### StreamingDataFrame.contains -Same as "StreamingSeries in other". +```python +@staticmethod +def contains(key: str) -> StreamingSeries +``` -Runtime result will be a `bool`. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L233) +Check if the key is present in the Row value. Example Snippet: ```python -from quixstreams import Application - -# Check if "str_column" is contained in a column with a list of strings and -# assign the resulting `bool` to a new column: "has_my_str". +# Add new column 'has_column' which contains a boolean indicating +# the presence of 'column_x' -sdf = app.dataframe() -sdf["has_my_str"] = sdf["str_column"].isin(sdf["column_with_list_of_strs"]) +sdf = StreamingDataframe() +sdf['has_column'] = sdf.contains('column_x') ``` **Arguments**: -- `other`: a container to check +- `key`: a column name to check. **Returns**: -new StreamingSeries +a Column object that evaluates to True if the key is present +or False otherwise. - + -#### StreamingSeries.contains +#### StreamingDataFrame.to\_topic ```python -def contains(other: Union[Self, object]) -> Self +def to_topic(topic: Topic, + key: Optional[Callable[[object], object]] = None) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L235) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L256) -Check if series value contains "other" - -Same as "other in StreamingSeries". +Produce current value to a topic. You can optionally specify a new key. -Runtime result will be a `bool`. +>***NOTE:*** A `RowProducer` instance must be assigned to +`StreamingDataFrame.producer` if not using :class:`quixstreams.app.Application` + to facilitate the execution of StreamingDataFrame. Example Snippet: @@ -969,421 +897,471 @@ Example Snippet: ```python from quixstreams import Application -# Check if "column_a" contains "my_substring" and assign the resulting -# `bool` to a new column: "has_my_substr" +# Produce to two different topics, changing the key for one of them. 
-sdf = app.dataframe() -sdf["has_my_substr"] = sdf["column_a"].contains("my_substring") +app = Application() +input_topic = app.topic("input_x") +output_topic_0 = app.topic("output_a") +output_topic_1 = app.topic("output_b") + +sdf = app.dataframe(input_topic) +sdf = sdf.to_topic(output_topic_0) +sdf = sdf.to_topic(output_topic_1, key=lambda data: data["a_field"]) ``` **Arguments**: -- `other`: object to check - -**Returns**: - -new StreamingSeries +- `topic`: instance of `Topic` +- `key`: a callable to generate a new message key, optional. +If passed, the return type of this callable must be serializable +by `key_serializer` defined for this Topic object. +By default, the current message key will be used. - + -#### StreamingSeries.is\_ +#### StreamingDataFrame.compose ```python -def is_(other: Union[Self, object]) -> Self +def compose() -> StreamCallable ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L260) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L295) -Check if series value refers to the same object as `other` +Compose all functions of this StreamingDataFrame into one big closure. -Runtime result will be a `bool`. +Closures are more performant than calling all the functions in the +`StreamingDataFrame` one-by-one. + +Generally not required by users; the `quixstreams.app.Application` class will +do this automatically. Example Snippet: ```python -# Check if "column_a" is the same as "column_b" and assign the resulting `bool` -# to a new column: "is_same" - from quixstreams import Application sdf = app.dataframe() -sdf["is_same"] = sdf["column_a"].is_(sdf["column_b"]) -``` - -**Arguments**: +sdf = sdf.apply(apply_func) +sdf = sdf.filter(filter_func) +sdf = sdf.compose() -- `other`: object to check for "is" +result_0 = sdf({"my": "record"}) +result_1 = sdf({"other": "record"}) +``` **Returns**: -new StreamingSeries +a function that accepts "value" +and returns a result of StreamingDataFrame - + -#### StreamingSeries.isnot +#### StreamingDataFrame.test ```python -def isnot(other: Union[Self, object]) -> Self +def test(value: object, ctx: Optional[MessageContext] = None) -> Any ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L283) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L325) -Check if series value does not refer to the same object as `other` +A shorthand to test `StreamingDataFrame` with provided value -Runtime result will be a `bool`. +and `MessageContext`. +**Arguments**: -Example Snippet: +- `value`: value to pass through `StreamingDataFrame` +- `ctx`: instance of `MessageContext`, optional. +Provide it if the StreamingDataFrame instance calls `to_topic()`, +has stateful functions or functions calling `get_current_key()`. +Default - `None`. 
-```python -from quixstreams import Application +**Returns**: -# Check if "column_a" is the same as "column_b" and assign the resulting `bool` -# to a new column: "is_not_same" +result of `StreamingDataFrame` -sdf = app.dataframe() -sdf["is_not_same"] = sdf["column_a"].isnot(sdf["column_b"]) -``` - -**Arguments**: - -- `other`: object to check for "is_not" - -**Returns**: - -new StreamingSeries - - + -#### StreamingSeries.isnull +#### StreamingDataFrame.tumbling\_window ```python -def isnull() -> Self +def tumbling_window(duration_ms: Union[int, timedelta], + grace_ms: Union[int, timedelta] = 0, + name: Optional[str] = None) -> TumblingWindowDefinition ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L307) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L343) -Check if series value is None. +Create a tumbling window transformation on this StreamingDataFrame. -Runtime result will be a `bool`. +Tumbling windows divide time into fixed-sized, non-overlapping windows. +They allow to perform stateful aggregations like `sum`, `reduce`, etc. +on top of the data and emit results downstream. -Example Snippet: +Notes: -```python -from quixstreams import Application +- Every window is grouped by the current Kafka message key. +- Messages with `None` key will be ignored. +- The time windows always use the current event time. -# Check if "column_a" is null and assign the resulting `bool` to a new column: -# "is_null" -sdf = app.dataframe() -sdf["is_null"] = sdf["column_a"].isnull() -``` -**Returns**: +Example Snippet: -new StreamingSeries +```python +app = Application() +sdf = app.dataframe(...) - +sdf = ( + # Define a tumbling window of 60s and grace period of 10s + sdf.tumbling_window( + duration_ms=timedelta(seconds=60), grace_ms=timedelta(seconds=10.0) + ) -#### StreamingSeries.notnull + # Specify the aggregation function + .sum() -```python -def notnull() -> Self + # Specify how the results should be emitted downstream. + # "all()" will emit results as they come for each updated window, + # possibly producing multiple messages per key-window pair + # "final()" will emit windows only when they are closed and cannot + # receive any updates anymore. + .all() +) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L330) - -Check if series value is not None. +**Arguments**: -Runtime result will be a `bool`. +- `duration_ms`: The length of each window. +Can be specified as either an `int` representing milliseconds or a +`timedelta` object. +>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond +value. +- `grace_ms`: The grace period for data arrival. +It allows late-arriving data (data arriving after the window +has theoretically closed) to be included in the window. +Can be specified as either an `int` representing milliseconds +or as a `timedelta` object. +>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond +value. +- `name`: The unique identifier for the window. If not provided, it will be +automatically generated based on the window's properties. +**Returns**: -Example Snippet: +`TumblingWindowDefinition` instance representing the tumbling window +configuration. +This object can be further configured with aggregation functions +like `sum`, `count`, etc. applied to the StreamingDataFrame. 
-```python -from quixstreams import Application + -# Check if "column_a" is not null and assign the resulting `bool` to a new column: -# "is_not_null" +#### StreamingDataFrame.hopping\_window -sdf = app.dataframe() -sdf["is_not_null"] = sdf["column_a"].notnull() +```python +def hopping_window(duration_ms: Union[int, timedelta], + step_ms: Union[int, timedelta], + grace_ms: Union[int, timedelta] = 0, + name: Optional[str] = None) -> HoppingWindowDefinition ``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/dataframe.py#L418) -new StreamingSeries +Create a hopping window transformation on this StreamingDataFrame. - +Hopping windows divide the data stream into overlapping windows based on time. +The overlap is controlled by the `step_ms` parameter. -#### StreamingSeries.abs +They allow to perform stateful aggregations like `sum`, `reduce`, etc. +on top of the data and emit results downstream. -```python -def abs() -> Self -``` +Notes: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/series.py#L353) +- Every window is grouped by the current Kafka message key. +- Messages with `None` key will be ignored. +- The time windows always use the current event time. -Get absolute value of the series value. Example Snippet: ```python -from quixstreams import Application - -# Get absolute value of "int_col" and add it to "other_int_col". -# Finally, assign the result to a new column: "abs_col_sum". +app = Application() +sdf = app.dataframe(...) -sdf = app.dataframe() -sdf["abs_col_sum"] = sdf["int_col"].abs() + sdf["other_int_col"] -``` +sdf = ( + # Define a hopping window of 60s with step 30s and grace period of 10s + sdf.hopping_window( + duration_ms=timedelta(seconds=60), + step_ms=timedelta(seconds=30), + grace_ms=timedelta(seconds=10) + ) -**Returns**: + # Specify the aggregation function + .sum() -new StreamingSeries + # Specify how the results should be emitted downstream. + # "all()" will emit results as they come for each updated window, + # possibly producing multiple messages per key-window pair + # "final()" will emit windows only when they are closed and cannot + # receive any updates anymore. + .all() +) +``` - +**Arguments**: -## quixstreams.dataframe.base +- `duration_ms`: The length of each window. It defines the time span for +which each window aggregates data. +Can be specified as either an `int` representing milliseconds +or a `timedelta` object. +>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond +value. +- `step_ms`: The step size for the window. +It determines how much each successive window moves forward in time. +Can be specified as either an `int` representing milliseconds +or a `timedelta` object. +>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond +value. +- `grace_ms`: The grace period for data arrival. +It allows late-arriving data to be included in the window, +even if it arrives after the window has theoretically moved forward. +Can be specified as either an `int` representing milliseconds +or a `timedelta` object. +>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond +value. +- `name`: The unique identifier for the window. If not provided, it will be +automatically generated based on the window's properties. 
- +**Returns**: -## quixstreams.dataframe.exceptions +`HoppingWindowDefinition` instance representing the hopping +window configuration. +This object can be further configured with aggregation functions +like `sum`, `count`, etc. and applied to the StreamingDataFrame. - + -## quixstreams.dataframe.dataframe +## quixstreams.dataframe.series - + -### StreamingDataFrame +### StreamingSeries ```python -class StreamingDataFrame(BaseStreaming) +class StreamingSeries(BaseStreaming) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L32) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L40) -`StreamingDataFrame` is the main object you will use for ETL work. +`StreamingSeries` are typically generated by `StreamingDataframes` when getting +elements from, or performing certain operations on, a `StreamingDataframe`, +thus acting as a representation of "column" value. -Typically created with an `app = quixstreams.app.Application()` instance, -via `sdf = app.dataframe()`. +They share some operations with the `StreamingDataframe`, but also provide some +additional functionality. + +Most column value operations are handled by this class, and `StreamingSeries` can +generate other `StreamingSeries` as a result of said operations. What it Does: -- Builds a data processing pipeline, declaratively (not executed immediately) - - Executes this pipeline on inputs at runtime (Kafka message values) -- Provides functions/interface similar to Pandas Dataframes/Series -- Enables stateful processing (and manages everything related to it) +- Allows ways to do simple operations with dataframe "column"/dictionary values: + - Basic ops like add, subtract, modulo, etc. +- Enables comparisons/inequalities: + - Greater than, equals, etc. + - and/or, is/not operations +- Can check for existence of columns in `StreamingDataFrames` +- Enables chaining of various operations together How to Use: -Define various operations while continuously reassigning to itself (or new fields). - -These operations will generally transform your data, access/update state, or produce -to kafka topics. - -We recommend your data structure to be "columnar" (aka a dict/JSON) in nature so -that it works with the entire interface, but simple types like `ints`, `str`, etc. -are also supported. +For the most part, you may not even notice this class exists! +They will naturally be created as a result of typical `StreamingDataFrame` use. -See the various methods and classes for more specifics, or for a deep dive into -usage, see `streamingdataframe.md` under the `docs/` folder. +Auto-complete should help you with valid methods and type-checking should alert +you to invalid operations between `StreamingSeries`. ->***NOTE:*** column referencing like `sdf["a_column"]` and various methods often - create other object types (typically `quixstreams.dataframe.StreamingSeries`), - which is expected; type hinting should alert you to any issues should you - attempt invalid operations with said objects (however, we cannot infer whether - an operation is valid with respect to your data!). +In general, any typical Pands dataframe operation between columns should be valid +with `StreamingSeries`, and you shouldn't have to think about them explicitly. Example Snippet: ```python +# Random methods for example purposes. More detailed explanations found under +# various methods or in the docs folder. 
+ sdf = StreamingDataframe() -sdf = sdf.apply(a_func) -sdf = sdf.filter(another_func) -sdf = sdf.to_topic(topic_obj) +sdf = sdf["column_a"].apply(a_func).apply(diff_func, stateful=True) +sdf["my_new_bool_field"] = sdf["column_b"].contains("this_string") +sdf["new_sum_field"] = sdf["column_c"] + sdf["column_d"] + 2 +sdf = sdf[["column_a"] & (sdf["new_sum_field"] >= 10)] ``` - + -#### StreamingDataFrame.apply +#### StreamingSeries.from\_func ```python -def apply(func: Union[DataFrameFunc, DataFrameStatefulFunc], - stateful: bool = False, - expand: bool = False) -> Self +@classmethod +def from_func(cls, func: StreamCallable) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L109) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L100) -Apply a function to transform the value and return a new value. +Create a StreamingSeries from a function. -The result will be passed downstream as an input value. +The provided function will be wrapped into `Apply` +**Arguments**: -Example Snippet: +- `func`: a function to apply -```python -# This stores a string in state and capitalizes every column with a string value. -# A second apply then keeps only the string value columns (shows non-stateful). -def func(d: dict, state: State): - value = d["store_field"] - if value != state.get("my_store_key"): - state.set("my_store_key") = value - return {k: v.upper() if isinstance(v, str) else v for k, v in d.items()} - -sdf = StreamingDataframe() -sdf = sdf.apply(func, stateful=True) -sdf = sdf.apply(lambda d: {k: v for k,v in d.items() if isinstance(v, str)}) - -``` - -**Arguments**: +**Returns**: -- `func`: a function to apply -- `stateful`: if `True`, the function will be provided with a second argument -of type `State` to perform stateful operations. -- `expand`: if True, expand the returned iterable into individual values -downstream. If returned value is not iterable, `TypeError` will be raised. -Default - `False`. +instance of `StreamingSeries` - + -#### StreamingDataFrame.update +#### StreamingSeries.apply ```python -def update(func: Union[DataFrameFunc, DataFrameStatefulFunc], - stateful: bool = False) -> Self +def apply(func: StreamCallable) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L152) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L114) -Apply a function to mutate value in-place or to perform a side effect +Add a callable to the execution list for this series. -that doesn't update the value (e.g. print a value to the console). +The provided callable should accept a single argument, which will be its input. +The provided callable should similarly return one output, or None -The result of the function will be ignored, and the original value will be -passed downstream. +They can be chained together or included with other operations. Example Snippet: ```python -# Stores a value and mutates a list by appending a new item to it. -# Also prints to console. +# The `StreamingSeries` are generated when `sdf["COLUMN_NAME"]` is called. +# This stores a string in state and capitalizes the column value; the result is +# assigned to a new column. +# Another apply converts a str column to an int, assigning it to a new column. 
-def func(values: list, state: State): - value = values[0] +def func(value: str, state: State): if value != state.get("my_store_key"): state.set("my_store_key") = value - values.append("new_item") + return v.upper() sdf = StreamingDataframe() -sdf = sdf.update(func, stateful=True) -sdf = sdf.update(lambda value: print("Received value: ", value)) +sdf["new_col"] = sdf["a_column"]["nested_dict_key"].apply(func, stateful=True) +sdf["new_col_2"] = sdf["str_col"].apply(lambda v: int(v)) + sdf["str_col2"] + 2 ``` **Arguments**: -- `func`: function to update value -- `stateful`: if `True`, the function will be provided with a second argument -of type `State` to perform stateful operations. +- `func`: a callable with one argument and one output - +**Returns**: -#### StreamingDataFrame.filter +a new `StreamingSeries` with the new callable added + + + +#### StreamingSeries.compose ```python -def filter(func: Union[DataFrameFunc, DataFrameStatefulFunc], - stateful: bool = False) -> Self +def compose(allow_filters: bool = True, + allow_updates: bool = True) -> StreamCallable ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L191) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L148) -Filter value using provided function. +Compose all functions of this StreamingSeries into one big closure. -If the function returns True-like value, the original value will be -passed downstream. -Otherwise, the `Filtered` exception will be raised (further processing for that -message will be skipped). +Closures are more performant than calling all the functions in the +`StreamingDataFrame` one-by-one. + +Generally not required by users; the `quixstreams.app.Application` class will +do this automatically. Example Snippet: ```python -# Stores a value and allows further processing only if the value is greater than -# what was previously stored. +from quixstreams import Application -def func(d: dict, state: State): - value = d["my_value"] - if value > state.get("my_store_key"): - state.set("my_store_key") = value - return True - return False +app = Application(...) -sdf = StreamingDataframe() -sdf = sdf.filter(func, stateful=True) +sdf = app.dataframe() +sdf = sdf["column_a"].apply(apply_func) +sdf = sdf["column_b"].contains(filter_func) +sdf = sdf.compose() + +result_0 = sdf({"my": "record"}) +result_1 = sdf({"other": "record"}) ``` **Arguments**: -- `func`: function to filter value -- `stateful`: if `True`, the function will be provided with second argument -of type `State` to perform stateful operations. +- `allow_filters`: If False, this function will fail with ValueError if +the stream has filter functions in the tree. Default - True. +- `allow_updates`: If False, this function will fail with ValueError if +the stream has update functions in the tree. Default - True. - +**Raises**: -#### StreamingDataFrame.contains +- `ValueError`: if disallowed functions are present in the tree of +underlying `Stream`. -```python -@staticmethod -def contains(key: str) -> StreamingSeries -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L244) +a function that accepts "value" +and returns a result of `StreamingSeries` -Check if the key is present in the Row value. 
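A small, hedged sketch of using the closure returned by `StreamingSeries.compose()` for a quick, broker-free check; the broker address and topic name are illustrative placeholders.

```python
from quixstreams import Application

app = Application(broker_address="localhost:9092")   # assumed local broker
sdf = app.dataframe(app.topic("orders"))              # hypothetical topic name

# Build a column expression and collapse it into a single callable
total = (sdf["price"] * sdf["quantity"]).apply(lambda v: round(v, 2))
compute_total = total.compose()

# The composed closure accepts a plain value (here, a dict) directly
print(compute_total({"price": 9.99, "quantity": 3}))  # -> 29.97
```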
+ -Example Snippet: +#### StreamingSeries.test ```python -# Add new column 'has_column' which contains a boolean indicating -# the presence of 'column_x' - -sdf = StreamingDataframe() -sdf['has_column'] = sdf.contains('column_x') +def test(value: Any, ctx: Optional[MessageContext] = None) -> Any ``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L195) + +A shorthand to test `StreamingSeries` with provided value + +and `MessageContext`. + **Arguments**: -- `key`: a column name to check. +- `value`: value to pass through `StreamingSeries` +- `ctx`: instance of `MessageContext`, optional. +Provide it if the StreamingSeries instance has +functions calling `get_current_key()`. +Default - `None`. **Returns**: -a Column object that evaluates to True if the key is present -or False otherwise. +result of `StreamingSeries` - + -#### StreamingDataFrame.to\_topic +#### StreamingSeries.isin ```python -def to_topic(topic: Topic, - key: Optional[Callable[[object], object]] = None) -> Self +def isin(other: Container) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L267) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L231) -Produce current value to a topic. You can optionally specify a new key. +Check if series value is in "other". ->***NOTE:*** A `RowProducer` instance must be assigned to -`StreamingDataFrame.producer` if not using :class:`quixstreams.app.Application` - to facilitate the execution of StreamingDataFrame. +Same as "StreamingSeries in other". + +Runtime result will be a `bool`. Example Snippet: @@ -1391,1151 +1369,1302 @@ Example Snippet: ```python from quixstreams import Application -# Produce to two different topics, changing the key for one of them. - -app = Application() -input_topic = app.topic("input_x") -output_topic_0 = app.topic("output_a") -output_topic_1 = app.topic("output_b") +# Check if "str_column" is contained in a column with a list of strings and +# assign the resulting `bool` to a new column: "has_my_str". -sdf = app.dataframe(input_topic) -sdf = sdf.to_topic(output_topic_0) -sdf = sdf.to_topic(output_topic_1, key=lambda data: data["a_field"]) +sdf = app.dataframe() +sdf["has_my_str"] = sdf["str_column"].isin(sdf["column_with_list_of_strs"]) ``` **Arguments**: -- `topic`: instance of `Topic` -- `key`: a callable to generate a new message key, optional. -If passed, the return type of this callable must be serializable -by `key_serializer` defined for this Topic object. -By default, the current message key will be used. +- `other`: a container to check - +**Returns**: -#### StreamingDataFrame.compose +new StreamingSeries + + + +#### StreamingSeries.contains ```python -def compose() -> StreamCallable +def contains(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L306) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L258) -Compose all functions of this StreamingDataFrame into one big closure. +Check if series value contains "other" -Closures are more performant than calling all the functions in the -`StreamingDataFrame` one-by-one. +Same as "other in StreamingSeries". 
-Generally not required by users; the `quixstreams.app.Application` class will -do this automatically. +Runtime result will be a `bool`. Example Snippet: ```python from quixstreams import Application -sdf = app.dataframe() -sdf = sdf.apply(apply_func) -sdf = sdf.filter(filter_func) -sdf = sdf.compose() -result_0 = sdf({"my": "record"}) -result_1 = sdf({"other": "record"}) +# Check if "column_a" contains "my_substring" and assign the resulting +# `bool` to a new column: "has_my_substr" + +sdf = app.dataframe() +sdf["has_my_substr"] = sdf["column_a"].contains("my_substring") ``` +**Arguments**: + +- `other`: object to check + **Returns**: -a function that accepts "value" -and returns a result of StreamingDataFrame +new StreamingSeries - + -#### StreamingDataFrame.test +#### StreamingSeries.is\_ ```python -def test(value: object, ctx: Optional[MessageContext] = None) -> Any +def is_(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L336) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L283) -A shorthand to test `StreamingDataFrame` with provided value +Check if series value refers to the same object as `other` -and `MessageContext`. +Runtime result will be a `bool`. + + +Example Snippet: + +```python +# Check if "column_a" is the same as "column_b" and assign the resulting `bool` +# to a new column: "is_same" + +from quixstreams import Application +sdf = app.dataframe() +sdf["is_same"] = sdf["column_a"].is_(sdf["column_b"]) +``` **Arguments**: -- `value`: value to pass through `StreamingDataFrame` -- `ctx`: instance of `MessageContext`, optional. -Provide it if the StreamingDataFrame instance calls `to_topic()`, -has stateful functions or functions calling `get_current_key()`. -Default - `None`. +- `other`: object to check for "is" **Returns**: -result of `StreamingDataFrame` +new StreamingSeries - + -#### StreamingDataFrame.tumbling\_window +#### StreamingSeries.isnot ```python -def tumbling_window(duration_ms: Union[int, timedelta], - grace_ms: Union[int, timedelta] = 0, - name: Optional[str] = None) -> TumblingWindowDefinition +def isnot(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L354) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L306) -Create a tumbling window transformation on this StreamingDataFrame. +Check if series value does not refer to the same object as `other` -Tumbling windows divide time into fixed-sized, non-overlapping windows. +Runtime result will be a `bool`. -They allow to perform stateful aggregations like `sum`, `reduce`, etc. -on top of the data and emit results downstream. -Notes: +Example Snippet: -- Every window is grouped by the current Kafka message key. -- Messages with `None` key will be ignored. -- The time windows always use the current event time. 
+```python +from quixstreams import Application + +# Check if "column_a" is the same as "column_b" and assign the resulting `bool` +# to a new column: "is_not_same" + +sdf = app.dataframe() +sdf["is_not_same"] = sdf["column_a"].isnot(sdf["column_b"]) +``` + +**Arguments**: + +- `other`: object to check for "is_not" + +**Returns**: + +new StreamingSeries + + + +#### StreamingSeries.isnull + +```python +def isnull() -> Self +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L330) + +Check if series value is None. +Runtime result will be a `bool`. Example Snippet: ```python -app = Application() -sdf = app.dataframe(...) +from quixstreams import Application -sdf = ( - # Define a tumbling window of 60s and grace period of 10s - sdf.tumbling_window( - duration_ms=timedelta(seconds=60), grace_ms=timedelta(seconds=10.0) - ) +# Check if "column_a" is null and assign the resulting `bool` to a new column: +# "is_null" - # Specify the aggregation function - .sum() +sdf = app.dataframe() +sdf["is_null"] = sdf["column_a"].isnull() +``` - # Specify how the results should be emitted downstream. - # "all()" will emit results as they come for each updated window, - # possibly producing multiple messages per key-window pair - # "final()" will emit windows only when they are closed and cannot - # receive any updates anymore. - .all() -) +**Returns**: + +new StreamingSeries + + + +#### StreamingSeries.notnull + +```python +def notnull() -> Self +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L353) + +Check if series value is not None. + +Runtime result will be a `bool`. + + +Example Snippet: + +```python +from quixstreams import Application + +# Check if "column_a" is not null and assign the resulting `bool` to a new column: +# "is_not_null" + +sdf = app.dataframe() +sdf["is_not_null"] = sdf["column_a"].notnull() +``` + +**Returns**: + +new StreamingSeries + + + +#### StreamingSeries.abs + +```python +def abs() -> Self +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/series.py#L376) + +Get absolute value of the series value. + +Example Snippet: + +```python +from quixstreams import Application + +# Get absolute value of "int_col" and add it to "other_int_col". +# Finally, assign the result to a new column: "abs_col_sum". + +sdf = app.dataframe() +sdf["abs_col_sum"] = sdf["int_col"].abs() + sdf["other_int_col"] +``` + +**Returns**: + +new StreamingSeries + + + +## quixstreams.dataframe + + + +## quixstreams.dataframe.utils + + + +#### ensure\_milliseconds + +```python +def ensure_milliseconds(delta: Union[int, timedelta]) -> int ``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/utils.py#L5) + +Convert timedelta to milliseconds. + +If the `delta` is not +This function will also round the value to the closest milliseconds in case of +higher precision. + **Arguments**: -- `duration_ms`: The length of each window. -Can be specified as either an `int` representing milliseconds or a -`timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `grace_ms`: The grace period for data arrival. -It allows late-arriving data (data arriving after the window -has theoretically closed) to be included in the window. 
-Can be specified as either an `int` representing milliseconds -or as a `timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `name`: The unique identifier for the window. If not provided, it will be -automatically generated based on the window's properties. +- `delta`: `timedelta` object **Returns**: -`TumblingWindowDefinition` instance representing the tumbling window -configuration. -This object can be further configured with aggregation functions -like `sum`, `count`, etc. applied to the StreamingDataFrame. +timedelta value in milliseconds as `int` - + -#### StreamingDataFrame.hopping\_window +## quixstreams.dataframe.exceptions + + + +## quixstreams.dataframe.windows.definitions + + + +### FixedTimeWindowDefinition ```python -def hopping_window(duration_ms: Union[int, timedelta], - step_ms: Union[int, timedelta], - grace_ms: Union[int, timedelta] = 0, - name: Optional[str] = None) -> HoppingWindowDefinition +class FixedTimeWindowDefinition(abc.ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/dataframe/dataframe.py#L429) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L20) -Create a hopping window transformation on this StreamingDataFrame. + -Hopping windows divide the data stream into overlapping windows based on time. -The overlap is controlled by the `step_ms` parameter. +#### FixedTimeWindowDefinition.sum -They allow to perform stateful aggregations like `sum`, `reduce`, etc. -on top of the data and emit results downstream. +```python +def sum() -> "FixedTimeWindow" +``` -Notes: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L67) -- Every window is grouped by the current Kafka message key. -- Messages with `None` key will be ignored. -- The time windows always use the current event time. +Configure the window to aggregate data by summing up values within +each window period. -Example Snippet: +**Returns**: + +an instance of `FixedTimeWindow` configured to perform sum aggregation. + + + +#### FixedTimeWindowDefinition.count ```python -app = Application() -sdf = app.dataframe(...) +def count() -> "FixedTimeWindow" +``` -sdf = ( - # Define a hopping window of 60s with step 30s and grace period of 10s - sdf.hopping_window( - duration_ms=timedelta(seconds=60), - step_ms=timedelta(seconds=30), - grace_ms=timedelta(seconds=10) - ) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L94) - # Specify the aggregation function - .sum() +Configure the window to aggregate data by counting the number of values - # Specify how the results should be emitted downstream. - # "all()" will emit results as they come for each updated window, - # possibly producing multiple messages per key-window pair - # "final()" will emit windows only when they are closed and cannot - # receive any updates anymore. - .all() +within each window period. + +**Returns**: + +an instance of `FixedTimeWindow` configured to perform record count. 
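To complement the `sum()` and `count()` aggregations above, a brief sketch (broker address and topic name are placeholders) that counts records per key in a tumbling window and emits updates as they arrive:

```python
from datetime import timedelta

from quixstreams import Application

app = Application(broker_address="localhost:9092")  # assumed local broker
sdf = app.dataframe(app.topic("clicks"))            # hypothetical topic name

# Count records per message key in 10-second tumbling windows and emit the
# running count on every update ("current" results, see FixedTimeWindow below)
sdf = sdf.tumbling_window(duration_ms=timedelta(seconds=10)).count().current()
```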
+ + + +#### FixedTimeWindowDefinition.mean + +```python +def mean() -> "FixedTimeWindow" +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L121) + +Configure the window to aggregate data by calculating the mean of the values + +within each window period. + +**Returns**: + +an instance of `FixedTimeWindow` configured to calculate the mean +of the values. + + + +#### FixedTimeWindowDefinition.reduce + +```python +def reduce(reducer: Callable[[Any, Any], Any], + initializer: Callable[[Any], Any]) -> "FixedTimeWindow" +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L152) + +Configure the window to perform a custom aggregation using `reducer` + +and `initializer` functions. + +Example Snippet: +```python +sdf = StreamingDataFrame(...) + +# Using "reduce()" to calculate multiple aggregates at once +def reducer(agg: dict, current: int): + aggregated = { + 'min': min(agg['min'], current), + 'max': max(agg['max'], current) + 'count': agg['count'] + 1 + } + return aggregated + +def initializer(current) -> dict: + return {'min': current, 'max': current, 'count': 1} + +window = ( + sdf.tumbling_window(duration_ms=1000) + .reduce(reducer=reducer, initializer=initializer) + .final() ) ``` **Arguments**: -- `duration_ms`: The length of each window. It defines the time span for -which each window aggregates data. -Can be specified as either an `int` representing milliseconds -or a `timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `step_ms`: The step size for the window. -It determines how much each successive window moves forward in time. -Can be specified as either an `int` representing milliseconds -or a `timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `grace_ms`: The grace period for data arrival. -It allows late-arriving data to be included in the window, -even if it arrives after the window has theoretically moved forward. -Can be specified as either an `int` representing milliseconds -or a `timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `name`: The unique identifier for the window. If not provided, it will be -automatically generated based on the window's properties. +- `reducer`: A function that takes two arguments +(the accumulated value and a new value) and returns a single value. +The returned value will be saved to the state store and sent downstream. +- `initializer`: A function to call for every first element of the window. +This function is used to initialize the aggregation within a window. **Returns**: -`HoppingWindowDefinition` instance representing the hopping -window configuration. -This object can be further configured with aggregation functions -like `sum`, `count`, etc. and applied to the StreamingDataFrame. +A window configured to perform custom reduce aggregation on the data. - + -## quixstreams.error\_callbacks +#### FixedTimeWindowDefinition.max - +```python +def max() -> "FixedTimeWindow" +``` -## quixstreams.exceptions +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L212) - +Configure a window to aggregate the maximum value within each window period. 
-## quixstreams.exceptions.base +**Returns**: - +an instance of `FixedTimeWindow` configured to calculate the maximum +value within each window period. -## quixstreams.exceptions.assignment + - +#### FixedTimeWindowDefinition.min -### PartitionAssignmentError +```python +def min() -> "FixedTimeWindow" +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/definitions.py#L241) + +Configure a window to aggregate the minimum value within each window period. + +**Returns**: + +an instance of `FixedTimeWindow` configured to calculate the maximum +value within each window period. + + + +## quixstreams.dataframe.windows + + + +## quixstreams.dataframe.windows.time\_based + + + +### FixedTimeWindow ```python -class PartitionAssignmentError(QuixException) +class FixedTimeWindow() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/exceptions/assignment.py#L6) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/time_based.py#L26) -Error happened during partition rebalancing. -Raised from `on_assign`, `on_revoke` and `on_lost` callbacks + - +#### FixedTimeWindow.final -## quixstreams.kafka +```python +def final(expand: bool = True) -> "StreamingDataFrame" +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/time_based.py#L89) -## quixstreams.kafka.consumer +Apply the window aggregation and return results only when the windows are - +closed. -### Consumer +The format of returned windows: +```python +{ + "start": , + "end": , + "value: , +} +``` + +The individual window is closed when the event time +(the maximum observed timestamp across the partition) passes +its end timestamp + grace period. +The closed windows cannot receive updates anymore and are considered final. + +>***NOTE:*** Windows can be closed only within the same message key. +If some message keys appear irregularly in the stream, the latest windows +can remain unprocessed until the message the same key is received. + +**Arguments**: + +- `expand`: if `True`, each window result will be sent downstream as +an individual item. Otherwise, the list of window results will be sent. +Default - `True` + + + +#### FixedTimeWindow.current ```python -class Consumer() +def current(expand: bool = True) -> "StreamingDataFrame" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L66) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/time_based.py#L126) - +Apply the window transformation to the StreamingDataFrame to return results -#### Consumer.\_\_init\_\_ +for each updated window. 
+The format of returned windows: ```python -def __init__(broker_address: str, - consumer_group: Optional[str], - auto_offset_reset: AutoOffsetReset, - auto_commit_enable: bool = True, - assignment_strategy: AssignmentStrategy = "range", - on_commit: Optional[Callable[ - [Optional[KafkaError], List[TopicPartition]], None]] = None, - extra_config: Optional[dict] = None) +{ + "start": , + "end": , + "value: , +} ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L67) +This method processes streaming data and returns results as they come, +regardless of whether the window is closed or not. -A wrapper around `confluent_kafka.Consumer`. +**Arguments**: -It initializes `confluent_kafka.Consumer` on demand -avoiding network calls during `__init__`, provides typing info for methods -and some reasonable defaults. +- `expand`: if `True`, each window result will be sent downstream as +an individual item. Otherwise, the list of window results will be sent. +Default - `True` + + + +## quixstreams.dataframe.windows.base + + + +#### get\_window\_ranges + +```python +def get_window_ranges(timestamp_ms: int, + duration_ms: int, + step_ms: Optional[int] = None) -> List[Tuple[int, int]] +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/dataframe/windows/base.py#L22) + +Get a list of window ranges for the given timestamp. **Arguments**: -- `broker_address`: Kafka broker host and port in format `:`. -Passed as `bootstrap.servers` to `confluent_kafka.Consumer`. -- `consumer_group`: Kafka consumer group. -Passed as `group.id` to `confluent_kafka.Consumer` -- `auto_offset_reset`: Consumer `auto.offset.reset` setting. -Available values: -- "earliest" - automatically reset the offset to the smallest offset -- "latest" - automatically reset the offset to the largest offset -- "error" - trigger an error (ERR__AUTO_OFFSET_RESET) which is retrieved - by consuming messages (used for testing) -- `auto_commit_enable`: If true, periodically commit offset of -the last message handed to the application. Default - `True`. -- `assignment_strategy`: The name of a partition assignment strategy. -Available values: "range", "roundrobin", "cooperative-sticky". -- `on_commit`: Offset commit result propagation callback. -Passed as "offset_commit_cb" to `confluent_kafka.Consumer`. -- `extra_config`: A dictionary with additional options that -will be passed to `confluent_kafka.Consumer` as is. -Note: values passed as arguments override values in `extra_config`. +- `timestamp_ms`: timestamp in milliseconds +- `duration_ms`: window duration in milliseconds +- `step_ms`: window step in milliseconds for hopping windows, optional. + +**Returns**: - +a list of (, ) tuples -#### Consumer.poll + -```python -def poll(timeout: Optional[float] = None) -> Optional[Message] -``` +## quixstreams.dataframe.base -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L126) + -Consumes a single message, calls callbacks and returns events. +## quixstreams.rowproducer -The application must check the returned :py:class:`Message` -object's :py:func:`Message.error()` method to distinguish between proper -messages (error() returns None), or an event or error. + -Note: Callbacks may be called from this method, such as -``on_assign``, ``on_revoke``, et al. 
+### RowProducer -**Arguments**: +```python +class RowProducer() +``` -- `timeout` (`float`): Maximum time in seconds to block waiting for message, -event or callback. None or -1 is infinite. Default: None. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowproducer.py#L14) -**Raises**: +A producer class that is capable of serializing Rows to bytes and send them to Kafka. -- `None`: RuntimeError if called on a closed consumer +The serialization is performed according to the Topic serialization settings. -**Returns**: +**Arguments**: -A Message object or None on timeout +- `broker_address`: Kafka broker host and port in format `:`. +Passed as `bootstrap.servers` to `confluent_kafka.Producer`. +- `partitioner`: A function to be used to determine the outgoing message +partition. +Available values: "random", "consistent_random", "murmur2", "murmur2_random", +"fnv1a", "fnv1a_random" +Default - "murmur2". +- `extra_config`: A dictionary with additional options that +will be passed to `confluent_kafka.Producer` as is. +Note: values passed as arguments override values in `extra_config`. +- `on_error`: a callback triggered when `RowProducer.produce_row()` +or `RowProducer.poll()` fail`. +If producer fails and the callback returns `True`, the exception +will be logged but not propagated. +The default callback logs an exception and returns `False`. - + -#### Consumer.subscribe +#### RowProducer.produce\_row ```python -def subscribe(topics: List[str], - on_assign: Optional[RebalancingCallback] = None, - on_revoke: Optional[RebalancingCallback] = None, - on_lost: Optional[RebalancingCallback] = None) +def produce_row(row: Row, + topic: Topic, + key: Optional[Any] = None, + partition: Optional[int] = None, + timestamp: Optional[int] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L144) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowproducer.py#L55) -Set subscription to supplied list of topics +Serialize Row to bytes according to the Topic serialization settings -This replaces a previous subscription. +and produce it to Kafka + +If this method fails, it will trigger the provided "on_error" callback. **Arguments**: -- `topics` (`list(str)`): List of topics (strings) to subscribe to. -- `on_assign` (`callable`): callback to provide handling of customized offsets -on completion of a successful partition re-assignment. -- `on_revoke` (`callable`): callback to provide handling of offset commits to -a customized store on the start of a rebalance operation. -- `on_lost` (`callable`): callback to provide handling in the case the partition -assignment has been lost. Partitions that have been lost may already be -owned by other members in the group and therefore committing offsets, -for example, may fail. +- `row`: Row object +- `topic`: Topic object +- `key`: message key, optional +- `partition`: partition number, optional +- `timestamp`: timestamp in milliseconds, optional -**Raises**: + -- `KafkaException`: -- `None`: RuntimeError if called on a closed consumer -.. py:function:: on_assign(consumer, partitions) -.. py:function:: on_revoke(consumer, partitions) -.. py:function:: on_lost(consumer, partitions) +#### RowProducer.poll - :param Consumer consumer: Consumer instance. - :param list(TopicPartition) partitions: Absolute list of partitions being - assigned or revoked. 
+```python +def poll(timeout: float = None) +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowproducer.py#L92) -#### Consumer.unsubscribe +Polls the producer for events and calls `on_delivery` callbacks. -```python -def unsubscribe() -``` +If `poll()` fails, it will trigger the provided "on_error" callback -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L238) +**Arguments**: -Remove current subscription. +- `timeout`: timeout in seconds -**Raises**: + -- `None`: KafkaException -- `None`: RuntimeError if called on a closed consumer +## quixstreams.core.stream.functions - + -#### Consumer.store\_offsets +### StreamFunction ```python -def store_offsets(message: Optional[Message] = None, - offsets: Optional[List[TopicPartition]] = None) +class StreamFunction(abc.ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L246) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L26) -.. py:function:: store_offsets([message=None], [offsets=None]) +A base class for all the streaming operations in Quix Streams. -Store offsets for a message or a list of offsets. +It provides two methods that return closures to be called on the input values: +- `get_executor` - a wrapper to execute on a single value +- `get_executor_expanded` - a wrapper to execute on an expanded value. + Expanded value is a list, where each item should be treated as a separate value. -``message`` and ``offsets`` are mutually exclusive. The stored offsets -will be committed according to 'auto.commit.interval.ms' or manual -offset-less `commit`. -Note that 'enable.auto.offset.store' must be set to False when using this API. + -**Arguments**: +#### StreamFunction.func -- `message` (`confluent_kafka.Message`): Store message's offset+1. -- `offsets` (`list(TopicPartition)`): List of topic+partitions+offsets to store. +```python +@property +def func() -> StreamCallable +``` -**Raises**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L43) -- `None`: KafkaException -- `None`: RuntimeError if called on a closed consumer +The original function - + -#### Consumer.commit +#### StreamFunction.get\_executor ```python -def commit(message: Optional[Message] = None, - offsets: Optional[List[TopicPartition]] = None, - asynchronous: bool = True) -> Optional[List[TopicPartition]] +@abc.abstractmethod +def get_executor() -> StreamCallable ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L280) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L50) -Commit a message or a list of offsets. +Returns a wrapper to be called on a single value. -The ``message`` and ``offsets`` parameters are mutually exclusive. -If neither is set, the current partition assignment's offsets are used instead. -Use this method to commit offsets if you have 'enable.auto.commit' set to False. + -**Arguments**: +#### StreamFunction.get\_executor\_expanded -- `message` (`confluent_kafka.Message`): Commit the message's offset+1. 
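A speculative sketch of chaining two `ApplyFunction` wrappers with the module-level `compose()` documented further below; it assumes the wrappers accept a plain callable in their constructor.

```python
from quixstreams.core.stream.functions import ApplyFunction, compose

# Each "Apply" function receives a value and its return value is passed
# on to the next function in the chain (assumed constructor: ApplyFunction(func))
add_one = ApplyFunction(lambda v: v + 1)
add_ten = ApplyFunction(lambda v: v + 10)

# compose() collapses the list into one closure instead of looping over
# the functions one by one
pipeline = compose([add_one, add_ten])
print(pipeline(5))  # -> 16
```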
-Note: By convention, committed offsets reflect the next message -to be consumed, **not** the last message consumed. -- `offsets` (`list(TopicPartition)`): List of topic+partitions+offsets to commit. -- `asynchronous` (`bool`): If true, asynchronously commit, returning None -immediately. If False, the commit() call will block until the commit -succeeds or fails and the committed offsets will be returned (on success). -Note that specific partitions may have failed and the .err field of -each partition should be checked for success. +```python +@abc.abstractmethod +def get_executor_expanded() -> StreamCallable +``` -**Raises**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L56) -- `None`: KafkaException -- `None`: RuntimeError if called on a closed consumer +Returns a wrapper to be called on a list of expanded values. - + -#### Consumer.committed +### ApplyFunction ```python -def committed(partitions: List[TopicPartition], - timeout: Optional[float] = None) -> List[TopicPartition] +class ApplyFunction(StreamFunction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L320) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L62) -.. py:function:: committed(partitions, [timeout=None]) +Wrap a function into "Apply" function. -Retrieve committed offsets for the specified partitions. +The provided function is expected to return a new value based on input, +and its result will always be passed downstream. -**Arguments**: + -- `partitions` (`list(TopicPartition)`): List of topic+partitions to query for stored offsets. -- `timeout` (`float`): Request timeout (seconds). -None or -1 is infinite. Default: None +### ApplyExpandFunction -**Raises**: +```python +class ApplyExpandFunction(StreamFunction) +``` -- `None`: KafkaException -- `None`: RuntimeError if called on a closed consumer +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L85) -**Returns**: +Wrap a function into "Apply" function and expand the returned iterable +into separate values downstream. -`list(TopicPartition)`: List of topic+partitions with offset and possibly error set. +The provided function is expected to return an `Iterable`. +If the returned value is not `Iterable`, `TypeError` will be raised. - + -#### Consumer.get\_watermark\_offsets +### FilterFunction ```python -def get_watermark_offsets(partition: TopicPartition, - timeout: Optional[float] = None, - cached: bool = False) -> Tuple[int, int] +class FilterFunction(StreamFunction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L340) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L114) -Retrieve low and high offsets for the specified partition. +Wraps a function into a "Filter" function. +The result of a Filter function is interpreted as boolean. +If it's `True`, the input will be return downstream. +If it's `False`, the `Filtered` exception will be raised to signal that the +value is filtered out. -**Arguments**: + -- `partition` (`TopicPartition`): Topic+partition to return offsets for. -- `timeout` (`float`): Request timeout (seconds). 
None or -1 is infinite. -Ignored if cached=True. Default: None -- `cached` (`bool`): Instead of querying the broker, use cached information. -Cached values: The low offset is updated periodically -(if statistics.interval.ms is set) while the high offset is updated on each -message fetched from the broker for this partition. +### UpdateFunction -**Raises**: +```python +class UpdateFunction(StreamFunction) +``` -- `None`: KafkaException -- `None`: RuntimeError if called on a closed consumer +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L146) -**Returns**: +Wrap a function into an "Update" function. -`tuple(int,int)`: Tuple of (low,high) on success or None on timeout. -The high offset is the offset of the last message + 1. +The provided function is expected to mutate the value +or to perform some side effect. +Its result will always be ignored, and its input is passed +downstream. - + -#### Consumer.list\_topics +#### compose ```python -def list_topics(topic: Optional[str] = None, - timeout: Optional[float] = None) -> ClusterMetadata +def compose(functions: List[StreamFunction], + allow_filters: bool = True, + allow_updates: bool = True, + allow_expands: bool = True) -> StreamCallable ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L366) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L175) -.. py:function:: list_topics([topic=None], [timeout=-1]) +Composes a list of functions and its parents into a single -Request metadata from the cluster. -This method provides the same information as -listTopics(), describeTopics() and describeCluster() in the Java Admin client. +big closure like this: +``` +[func, func, func] -> func(func(func())) +``` + +Closures are more performant than calling all functions one by one in a loop. **Arguments**: -- `topic` (`str`): If specified, only request information about this topic, -else return results for all topics in cluster. -Warning: If auto.create.topics.enable is set to true on the broker and -an unknown topic is specified, it will be created. -- `timeout` (`float`): The maximum response time before timing out -None or -1 is infinite. Default: None +- `functions`: list of `StreamFunction` objects to compose +- `allow_filters`: If False, will fail with `ValueError` if +the list has `FilterFunction`. Default - True. +- `allow_updates`: If False, will fail with `ValueError` if +the list has `UpdateFunction`. Default - True. +- `allow_expands`: If False, will fail with `ValueError` if +the list has `ApplyFunction` with "expand=True". Default - True. **Raises**: -- `None`: KafkaException +- `ValueError`: if disallowed functions are present in the list of functions. - + -#### Consumer.memberid +#### composer ```python -def memberid() -> str +def composer(outer_func: StreamCallable, + inner_func: StreamCallable) -> Callable[[T], R] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L389) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/functions.py#L227) -Return this client's broker-assigned group member id. +A function that wraps two other functions into a closure. 
-The member id is assigned by the group coordinator and is propagated to -the consumer during rebalance. +It passes the result of the inner function as an input to the outer function. - :returns: Member id string or None - :rtype: string - :raises: RuntimeError if called on a closed consumer +**Returns**: +a function with one argument (value) - + -#### Consumer.offsets\_for\_times +## quixstreams.core.stream -```python -def offsets_for_times(partitions: List[TopicPartition], - timeout: Optional[float] = None) -> List[TopicPartition] -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L402) +## quixstreams.core.stream.stream -Look up offsets by timestamp for the specified partitions. + -The returned offset for each partition is the earliest offset whose -timestamp is greater than or equal to the given timestamp in the -corresponding partition. If the provided timestamp exceeds that of the -last message in the partition, a value of -1 will be returned. +### Stream - :param list(TopicPartition) partitions: topic+partitions with timestamps - in the TopicPartition.offset field. - :param float timeout: The maximum response time before timing out. - None or -1 is infinite. Default: None - :returns: List of topic+partition with offset field set and possibly error set - :rtype: list(TopicPartition) - :raises: KafkaException - :raises: RuntimeError if called on a closed consumer +```python +class Stream() +``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L22) - + -#### Consumer.pause +#### Stream.\_\_init\_\_ ```python -def pause(partitions: List[TopicPartition]) +def __init__(func: Optional[StreamFunction] = None, + parent: Optional[Self] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L428) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L23) -Pause consumption for the provided list of partitions. - -Paused partitions must be tracked manually. +A base class for all streaming operations. -Does NOT affect the result of Consumer.assignment(). +`Stream` is an abstraction of a function pipeline. +Each Stream has a function and a parent (None by default). +When adding new function to the stream, it creates a new `Stream` object and +sets "parent" to the previous `Stream` to maintain an order of execution. -**Arguments**: +Streams supports 3 types of functions: +- "Apply" - generate new values based on a previous one. + The result of an Apply function is passed downstream to the next functions. + If "expand=True" is passed and the function returns an `Iterable`, + each item of it will be treated as a separate value downstream. +- "Update" - update values in-place. + The result of an Update function is always ignored, and its input is passed + downstream. +- "Filter" - to filter values from the Stream. + The result of a Filter function is interpreted as boolean. + If it's `True`, the input will be passed downstream. + If it's `False`, the `Filtered` exception will be raised to signal that the + value is filtered out. -- `partitions` (`list(TopicPartition)`): List of topic+partitions to pause. 
+To execute the functions on the `Stream`, call `.compose()` method, and +it will return a closure to execute all the functions accumulated in the Stream +and its parents. -**Raises**: +**Arguments**: -- `None`: KafkaException +- `func`: a function to be called on the stream. +It is expected to be wrapped into one of "Apply", "Filter" or "Update" from +`quixstreams.core.stream.functions` package. +Default - "Apply(lambda v: v)". +- `parent`: a parent `Stream` - + -#### Consumer.resume +#### Stream.add\_filter ```python -def resume(partitions: List[TopicPartition]) +def add_filter(func: Callable[[T], R]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L442) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L79) -.. py:function:: resume(partitions) +Add a function to filter values from the Stream. -Resume consumption for the provided list of partitions. +The return value of the function will be interpreted as `bool`. +If the function returns `False`-like result, the Stream will raise `Filtered` +exception during execution. **Arguments**: -- `partitions` (`list(TopicPartition)`): List of topic+partitions to resume. +- `func`: a function to filter values from the stream -**Raises**: +**Returns**: -- `None`: KafkaException +a new `Stream` derived from the current one - + -#### Consumer.position +#### Stream.add\_apply ```python -def position(partitions: List[TopicPartition]) -> List[TopicPartition] +def add_apply(func: Callable[[T], R], expand: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L454) - -Retrieve current positions (offsets) for the specified partitions. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L92) -**Arguments**: +Add an "apply" function to the Stream. -- `partitions` (`list(TopicPartition)`): List of topic+partitions to return -current offsets for. The current offset is the offset of -the last consumed message + 1. +The function is supposed to return a new value, which will be passed +further during execution. -**Raises**: +**Arguments**: -- `None`: KafkaException -- `None`: RuntimeError if called on a closed consumer +- `func`: a function to generate a new value +- `expand`: if True, expand the returned iterable into individual values +downstream. If returned value is not iterable, `TypeError` will be raised. +Default - `False`. **Returns**: -`list(TopicPartition)`: List of topic+partitions with offset and possibly error set. +a new `Stream` derived from the current one - + -#### Consumer.seek +#### Stream.add\_update ```python -def seek(partition: TopicPartition) +def add_update(func: Callable[[T], object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L468) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L109) -Set consume position for partition to offset. - -The offset may be an absolute (>=0) or a -logical offset (:py:const:`OFFSET_BEGINNING` et.al). +Add an "update" function to the Stream, that will mutate the input value. 
-seek() may only be used to update the consume offset of an -actively consumed partition (i.e., after :py:const:`assign()`), -to set the starting offset of partition not being consumed instead -pass the offset in an `assign()` call. +The return of this function will be ignored and its input +will be passed downstream. **Arguments**: -- `partition` (`TopicPartition`): Topic+partition+offset to seek to. +- `func`: a function to mutate the value -**Raises**: +**Returns**: -- `None`: KafkaException +a new Stream derived from the current one - + -#### Consumer.assignment +#### Stream.diff ```python -def assignment() -> List[TopicPartition] +def diff(other: "Stream") -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L485) - -Returns the current partition assignment. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L121) -**Raises**: +Takes the difference between Streams `self` and `other` based on their last -- `None`: KafkaException -- `None`: RuntimeError if called on a closed consumer +common parent, and returns a new `Stream` that includes only this difference. -**Returns**: +It's impossible to calculate a diff when: + - Streams don't have a common parent. + - When the `self` Stream already includes all the nodes from + the `other` Stream, and the resulting diff is empty. -`list(TopicPartition)`: List of assigned topic+partitions. +**Arguments**: - +- `other`: a `Stream` to take a diff from. -#### Consumer.set\_sasl\_credentials +**Raises**: -```python -def set_sasl_credentials(username: str, password: str) -``` +- `ValueError`: if Streams don't have a common parent +or if the diff is empty. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L498) +**Returns**: -Sets the SASL credentials used for this client. -These credentials will overwrite the old ones, and will be used the next -time the client needs to authenticate. -This method will not disconnect existing broker connections that have been -established with the old credentials. -This method is applicable only to SASL PLAIN and SCRAM mechanisms. +new `Stream` instance including all the Streams from the diff - + -#### Consumer.incremental\_assign +#### Stream.tree ```python -def incremental_assign(partitions: List[TopicPartition]) +def tree() -> List[Self] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L510) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L150) -Assign new partitions. +Return a list of all parent Streams including the node itself. -Can be called outside the `Consumer` `on_assign` callback (multiple times). -Partitions immediately show on `Consumer.assignment()`. +The tree is ordered from child to parent (current node comes first). -Any additional partitions besides the ones passed during the `Consumer` -`on_assign` callback will NOT be associated with the consumer group. 
+**Returns**: - +a list of `Stream` objects -#### Consumer.incremental\_unassign + + +#### Stream.compose ```python -def incremental_unassign(partitions: List[TopicPartition]) +def compose(allow_filters: bool = True, + allow_updates: bool = True, + allow_expands: bool = True) -> Callable[[T], R] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L522) - -Revoke partitions. - -Can be called outside an on_revoke callback. - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/core/stream/stream.py#L164) -#### Consumer.close +Compose a list of functions from this `Stream` and its parents into one -```python -def close() -``` +big closure using a "composer" function. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/consumer.py#L530) +Closures are more performant than calling all the functions in the +`Stream.tree()` one-by-one. -Close down and terminate the Kafka Consumer. +**Arguments**: -Actions performed: +- `allow_filters`: If False, this function will fail with `ValueError` if +the stream has filter functions in the tree. Default - True. +- `allow_updates`: If False, this function will fail with `ValueError` if +the stream has update functions in the tree. Default - True. +- `allow_expands`: If False, this function will fail with `ValueError` if +the stream has functions with "expand=True" in the tree. Default - True. -- Stops consuming. -- Commits offsets, unless the consumer property 'enable.auto.commit' is set to False. -- Leaves the consumer group. +**Raises**: -Registered callbacks may be called from this method, -see `poll()` for more info. +- `ValueError`: if disallowed functions are present in the stream tree. + - +## quixstreams.core -## quixstreams.kafka.producer + - +## quixstreams.processing\_context -### Producer + + +### ProcessingContext ```python -class Producer() +@dataclasses.dataclass +class ProcessingContext() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/processing_context.py#L21) - +A class to share processing-related objects +between `Application` and `StreamingDataFrame` instances. -#### Producer.\_\_init\_\_ + + +#### ProcessingContext.store\_offset ```python -def __init__(broker_address: str, - partitioner: Partitioner = "murmur2", - extra_config: Optional[dict] = None) +def store_offset(topic: str, partition: int, offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L55) - -A wrapper around `confluent_kafka.Producer`. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/processing_context.py#L41) -It initializes `confluent_kafka.Producer` on demand -avoiding network calls during `__init__`, provides typing info for methods -and some reasonable defaults. +Store the offset of the processed message to the checkpoint. **Arguments**: -- `broker_address`: Kafka broker host and port in format `:`. -Passed as `bootstrap.servers` to `confluent_kafka.Producer`. -- `partitioner`: A function to be used to determine the outgoing message -partition. 
-Available values: "random", "consistent_random", "murmur2", "murmur2_random", -"fnv1a", "fnv1a_random" -Default - "murmur2". -- `extra_config`: A dictionary with additional options that -will be passed to `confluent_kafka.Producer` as is. -Note: values passed as arguments override values in `extra_config`. +- `topic`: topic name +- `partition`: partition number +- `offset`: message offset - + -#### Producer.produce +#### ProcessingContext.init\_checkpoint ```python -def produce(topic: str, - value: Optional[Union[str, bytes]] = None, - key: Optional[Union[str, bytes]] = None, - headers: Optional[Headers] = None, - partition: Optional[int] = None, - timestamp: Optional[int] = None, - poll_timeout: float = 5.0, - buffer_error_max_tries: int = 3) +def init_checkpoint() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L94) - -Produce message to topic. - -It also polls Kafka for callbacks before producing in order to minimize -the probability of `BufferError`. -If `BufferError` still happens, the method will poll Kafka with timeout -to free up the buffer and try again. - -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/processing_context.py#L51) -- `topic`: topic name -- `value`: message value -- `key`: message key -- `headers`: message headers -- `partition`: topic partition -- `timestamp`: message timestamp -- `poll_timeout`: timeout for `poll()` call in case of `BufferError` -- `buffer_error_max_tries`: max retries for `BufferError`. -Pass `0` to not retry after `BufferError`. +Initialize a new checkpoint - + -#### Producer.poll +#### ProcessingContext.commit\_checkpoint ```python -def poll(timeout: float = 0) +def commit_checkpoint(force: bool = False) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L152) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/processing_context.py#L62) -Polls the producer for events and calls `on_delivery` callbacks. +Commit the current checkpoint. -**Arguments**: +The actual commit will happen only when: -- `timeout`: poll timeout seconds; Default: 0 (unlike others) -> NOTE: -1 will hang indefinitely if there are no messages to acknowledge +1. The checkpoint has at least one stored offset +2. The checkpoint is expired or `force=True` is passed - +**Arguments**: -#### Producer.flush +- `force`: if `True`, commit the checkpoint before its expiration deadline. -```python -def flush(timeout: Optional[float] = None) -> int -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/kafka/producer.py#L160) +## quixstreams.utils -Wait for all messages in the Producer queue to be delivered. + -**Arguments**: +## quixstreams.utils.dicts -- `timeout` (`float`): time to attempt flushing (seconds). -None or -1 is infinite. Default: None + -**Returns**: +#### dict\_values -number of messages remaining to flush +```python +def dict_values(d: object) -> List +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/utils/dicts.py#L4) -## quixstreams.models +Recursively unpacks a set of nested dicts to get a flattened list of leaves, - +where "leaves" are the first non-dict item. 
-## quixstreams.models.serializers +i.e {"a": {"b": {"c": 1}, "d": 2}, "e": 3} becomes [1, 2, 3] - +**Arguments**: -## quixstreams.models.serializers.json +- `d`: initially, a dict (with potentially nested dicts) - +**Returns**: -### JSONSerializer +a list with all the leaves of the various contained dicts -```python -class JSONSerializer(Serializer) -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/json.py#L13) +## quixstreams.utils.json - + -#### JSONSerializer.\_\_init\_\_ +#### dumps ```python -def __init__(dumps: Callable[[Any], Union[str, bytes]] = default_dumps) +def dumps(value: Any) -> bytes ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/json.py#L14) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/utils/json.py#L8) -Serializer that returns data in json format. +Serialize to JSON using `orjson` package. **Arguments**: -- `dumps`: a function to serialize objects to json. -Default - :py:func:`quixstreams.utils.json.dumps` +- `value`: value to serialize to JSON - +**Returns**: -### JSONDeserializer +bytes + + + +#### loads ```python -class JSONDeserializer(Deserializer) +def loads(value: bytes) -> Any ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/json.py#L35) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/utils/json.py#L18) - +Deserialize from JSON using `orjson` package. -#### JSONDeserializer.\_\_init\_\_ +Main differences: +- It returns `bytes` +- It doesn't allow non-str keys in dictionaries -```python -def __init__(column_name: Optional[str] = None, - loads: Callable[[Union[bytes, bytearray]], Any] = default_loads) -``` +**Arguments**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/json.py#L36) +- `value`: value to deserialize from -Deserializer that parses data from JSON +**Returns**: -**Arguments**: +object -- `column_name`: if provided, the deserialized value will be wrapped into -dictionary with `column_name` as a key. -- `loads`: function to parse json from bytes. -Default - :py:func:`quixstreams.utils.json.loads`. 
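> Editor's note: a quick illustration of the two utility modules documented above (sample values are made up; not part of the generated reference):

```python
from quixstreams.utils.dicts import dict_values
from quixstreams.utils.json import dumps, loads

# Flatten nested dicts down to their leaves
dict_values({"a": {"b": {"c": 1}, "d": 2}, "e": 3})  # -> [1, 2, 3]

# orjson-based JSON helpers: dumps() returns bytes, loads() parses them back
raw = dumps({"temperature": 21.5})  # -> b'{"temperature":21.5}'
loads(raw)                          # -> {"temperature": 21.5}
```
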
+ - +## quixstreams.types -## quixstreams.models.serializers.simple\_types + - +## quixstreams.models.timestamps -### BytesDeserializer + + +### TimestampType ```python -class BytesDeserializer(Deserializer) +class TimestampType(enum.IntEnum) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L44) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/timestamps.py#L8) -A deserializer to bypass bytes without any changes + - +#### TIMESTAMP\_NOT\_AVAILABLE -### BytesSerializer +timestamps not supported by broker -```python -class BytesSerializer(Serializer) -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L55) +#### TIMESTAMP\_CREATE\_TIME -A serializer to bypass bytes without any changes +message creation time (or source / producer time) - + -### StringDeserializer +#### TIMESTAMP\_LOG\_APPEND\_TIME + +broker receive time + + + +### MessageTimestamp ```python -class StringDeserializer(Deserializer) +class MessageTimestamp() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L64) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/timestamps.py#L14) - +Represents a timestamp of incoming Kafka message. -#### StringDeserializer.\_\_init\_\_ +It is made pseudo-immutable (i.e. public attributes don't have setters), and +it should not be mutated during message processing. + + + +#### MessageTimestamp.create ```python -def __init__(column_name: Optional[str] = None, codec: str = "utf_8") +@classmethod +def create(cls, timestamp_type: int, milliseconds: int) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L65) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/timestamps.py#L41) -Deserializes bytes to strings using the specified encoding. +Create a Timestamp object based on data + +from `confluent_kafka.Message.timestamp()`. + +If timestamp type is "TIMESTAMP_NOT_AVAILABLE", the milliseconds are set to None **Arguments**: -- `codec`: string encoding -A wrapper around `confluent_kafka.serialization.StringDeserializer`. +- `timestamp_type`: a timestamp type represented as a number +Can be one of: +- "0" - TIMESTAMP_NOT_AVAILABLE, timestamps not supported by broker. +- "1" - TIMESTAMP_CREATE_TIME, message creation time (or source / producer time). +- "2" - TIMESTAMP_LOG_APPEND_TIME, broker receive time. +- `milliseconds`: the number of milliseconds since the epoch (UTC). - +**Returns**: -### IntegerDeserializer +Timestamp object -```python -class IntegerDeserializer(Deserializer) -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L84) +## quixstreams.models -Deserializes bytes to integers. + -A wrapper around `confluent_kafka.serialization.IntegerDeserializer`. 
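> Editor's note: a minimal sketch of building a timestamp with the factory documented above (the millisecond values are illustrative; not part of the generated reference):

```python
from quixstreams.models.timestamps import MessageTimestamp, TimestampType

# Build a timestamp from the (type, milliseconds) pair returned by
# confluent_kafka.Message.timestamp()
ts = MessageTimestamp.create(
    timestamp_type=TimestampType.TIMESTAMP_CREATE_TIME,  # == 1
    milliseconds=1712067594000,
)

# For TIMESTAMP_NOT_AVAILABLE (0) the milliseconds are set to None per the docstring
unavailable = MessageTimestamp.create(timestamp_type=0, milliseconds=0)
```
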
+## quixstreams.models.messagecontext - + -### DoubleDeserializer +### MessageContext ```python -class DoubleDeserializer(Deserializer) +class MessageContext() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L103) - -Deserializes float to IEEE 764 binary64. - -A wrapper around `confluent_kafka.serialization.DoubleDeserializer`. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/messagecontext.py#L7) - +An object with Kafka message properties. -### StringSerializer +It is made pseudo-immutable (i.e. public attributes don't have setters), and +it should not be mutated during message processing. -```python -class StringSerializer(Serializer) -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L122) +## quixstreams.models.types - + -#### StringSerializer.\_\_init\_\_ +### ConfluentKafkaMessageProto ```python -def __init__(codec: str = "utf_8") +class ConfluentKafkaMessageProto(Protocol) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L123) - -Serializes strings to bytes using the specified encoding. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/types.py#L12) -**Arguments**: +An interface of `confluent_kafka.Message`. -- `codec`: string encoding +Use it to not depend on exact implementation and simplify testing. - +Instances of `confluent_kafka.Message` cannot be directly created from Python, +see https://github.com/confluentinc/confluent-kafka-python/issues/1535. -### IntegerSerializer + -```python -class IntegerSerializer(Serializer) -``` +## quixstreams.models.serializers -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L135) + -Serializes integers to bytes +## quixstreams.models.serializers.exceptions - + -### DoubleSerializer +### IgnoreMessage ```python -class DoubleSerializer(Serializer) +class IgnoreMessage(exceptions.QuixException) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L148) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/exceptions.py#L46) -Serializes floats to bytes +Raise this exception from Deserializer.__call__ in order to ignore the processing +of the particular message. @@ -2549,7 +2678,7 @@ Serializes floats to bytes class QuixDeserializer(JSONDeserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L73) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L73) Handles Deserialization for any Quix-formatted topic. 
@@ -2564,7 +2693,7 @@ def __init__(column_name: Optional[str] = None, loads: Callable[[Union[bytes, bytearray]], Any] = default_loads) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L80) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L80) **Arguments**: @@ -2582,7 +2711,7 @@ Default - :py:func:`quixstreams.utils.json.loads`. def split_values() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L100) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L100) Each Quix message might contain data for multiple Rows. This property informs the downstream processors about that, so they can @@ -2597,7 +2726,7 @@ def deserialize(model_key: str, value: Union[List[Mapping], Mapping]) -> Iterable[Mapping] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L153) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L153) Deserialization function for particular data types (Timeseries or EventData). @@ -2618,7 +2747,7 @@ Iterable of dicts class QuixSerializer(JSONSerializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L274) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L274) @@ -2629,7 +2758,7 @@ def __init__(as_legacy: bool = True, dumps: Callable[[Any], Union[str, bytes]] = default_dumps) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L278) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L278) Serializer that returns data in json format. @@ -2647,7 +2776,7 @@ Default - :py:func:`quixstreams.utils.json.dumps` class QuixTimeseriesSerializer(QuixSerializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L321) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L321) Serialize data to JSON formatted according to Quix Timeseries format. @@ -2679,7 +2808,7 @@ Output: class QuixEventsSerializer(QuixSerializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L409) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L409) Serialize data to JSON formatted according to Quix EventData format. 
The input value is expected to be a dictionary with the following keys: @@ -2708,353 +2837,363 @@ Output: } ``` - + -## quixstreams.models.serializers.base +## quixstreams.models.serializers.simple\_types - + -### SerializationContext +### BytesDeserializer ```python -class SerializationContext() +class BytesDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L22) - -Provides additional context for message serialization/deserialization. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L44) -Every `Serializer` and `Deserializer` receives an instance of `SerializationContext` +A deserializer to bypass bytes without any changes - + -#### SerializationContext.to\_confluent\_ctx +### BytesSerializer ```python -def to_confluent_ctx(field: MessageField) -> _SerializationContext +class BytesSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L35) - -Convert `SerializationContext` to `confluent_kafka.SerializationContext` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L55) -in order to re-use serialization already provided by `confluent_kafka` library. +A serializer to bypass bytes without any changes -**Arguments**: + -- `field`: instance of `confluent_kafka.serialization.MessageField` +### StringDeserializer -**Returns**: +```python +class StringDeserializer(Deserializer) +``` -instance of `confluent_kafka.serialization.SerializationContext` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L64) - + -### Deserializer +#### StringDeserializer.\_\_init\_\_ ```python -class Deserializer(abc.ABC) +def __init__(column_name: Optional[str] = None, codec: str = "utf_8") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L47) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L65) - +Deserializes bytes to strings using the specified encoding. -#### Deserializer.\_\_init\_\_ +**Arguments**: + +- `codec`: string encoding +A wrapper around `confluent_kafka.serialization.StringDeserializer`. + + + +### IntegerDeserializer ```python -def __init__(column_name: Optional[str] = None, *args, **kwargs) +class IntegerDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L48) - -A base class for all Deserializers +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L84) -**Arguments**: +Deserializes bytes to integers. -- `column_name`: if provided, the deserialized value will be wrapped into -dictionary with `column_name` as a key. +A wrapper around `confluent_kafka.serialization.IntegerDeserializer`. 
- + -#### Deserializer.split\_values +### DoubleDeserializer ```python -@property -def split_values() -> bool +class DoubleDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L58) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L103) -Return True if the deserialized message should be considered as Iterable -and each item in it should be processed as a separate message. +Deserializes float to IEEE 764 binary64. - +A wrapper around `confluent_kafka.serialization.DoubleDeserializer`. -### Serializer + + +### StringSerializer ```python -class Serializer(abc.ABC) +class StringSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L74) - -A base class for all Serializers +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L122) - + -#### Serializer.extra\_headers +#### StringSerializer.\_\_init\_\_ ```python -@property -def extra_headers() -> MessageHeadersMapping +def __init__(codec: str = "utf_8") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/base.py#L80) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L123) -Informs producer to set additional headers +Serializes strings to bytes using the specified encoding. -for the message it will be serializing +**Arguments**: -Must return a dictionary with headers. -Keys must be strings, and values must be strings, bytes or None. +- `codec`: string encoding -**Returns**: + -dict with headers +### IntegerSerializer - +```python +class IntegerSerializer(Serializer) +``` -## quixstreams.models.serializers.exceptions +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L135) - +Serializes integers to bytes -### IgnoreMessage + + +### DoubleSerializer ```python -class IgnoreMessage(exceptions.QuixException) +class DoubleSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/exceptions.py#L46) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L148) -Raise this exception from Deserializer.__call__ in order to ignore the processing -of the particular message. 
+Serializes floats to bytes - + -## quixstreams.models.topics +## quixstreams.models.serializers.json - + -## quixstreams.models.topics.manager +### JSONSerializer - +```python +class JSONSerializer(Serializer) +``` -#### affirm\_ready\_for\_create +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/json.py#L13) + + + +#### JSONSerializer.\_\_init\_\_ ```python -def affirm_ready_for_create(topics: List[Topic]) +def __init__(dumps: Callable[[Any], Union[str, bytes]] = default_dumps) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/json.py#L14) -Validate a list of topics is ready for creation attempt +Serializer that returns data in json format. **Arguments**: -- `topics`: list of `Topic`s +- `dumps`: a function to serialize objects to json. +Default - :py:func:`quixstreams.utils.json.dumps` - + -### TopicManager +### JSONDeserializer ```python -class TopicManager() +class JSONDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L29) - -The source of all topic management with quixstreams. - -Generally initialized and managed automatically by an `Application`, -but allows a user to work with it directly when needed, such as using it alongside -a plain `Producer` to create its topics. - -See methods for details. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/json.py#L35) - + -#### TopicManager.\_\_init\_\_ +#### JSONDeserializer.\_\_init\_\_ ```python -def __init__(topic_admin: TopicAdmin, create_timeout: int = 60) +def __init__(column_name: Optional[str] = None, + loads: Callable[[Union[bytes, bytearray]], Any] = default_loads) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L48) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/json.py#L36) + +Deserializer that parses data from JSON **Arguments**: -- `topic_admin`: an `Admin` instance (required for some functionality) -- `create_timeout`: timeout for topic creation +- `column_name`: if provided, the deserialized value will be wrapped into +dictionary with `column_name` as a key. +- `loads`: function to parse json from bytes. +Default - :py:func:`quixstreams.utils.json.loads`. - + -#### TopicManager.changelog\_topics +## quixstreams.models.serializers.base + + + +### SerializationContext ```python -@property -def changelog_topics() -> Dict[str, Dict[str, Topic]] +class SerializationContext() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L71) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L22) -Note: `Topic`s are the changelogs. +Provides additional context for message serialization/deserialization. 
-returns: the changelog topic dict, {topic_name: {suffix: Topic}} +Every `Serializer` and `Deserializer` receives an instance of `SerializationContext` - + -#### TopicManager.topic\_config +#### SerializationContext.to\_confluent\_ctx ```python -def topic_config(num_partitions: Optional[int] = None, - replication_factor: Optional[int] = None, - extra_config: Optional[dict] = None) -> TopicConfig +def to_confluent_ctx(field: MessageField) -> _SerializationContext ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L121) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L35) -Convenience method for generating a `TopicConfig` with default settings +Convert `SerializationContext` to `confluent_kafka.SerializationContext` + +in order to re-use serialization already provided by `confluent_kafka` library. **Arguments**: -- `num_partitions`: the number of topic partitions -- `replication_factor`: the topic replication factor -- `extra_config`: other optional configuration settings +- `field`: instance of `confluent_kafka.serialization.MessageField` **Returns**: -a TopicConfig object +instance of `confluent_kafka.serialization.SerializationContext` - + -#### TopicManager.topic +### Deserializer ```python -def topic(name: str, - value_deserializer: Optional[DeserializerType] = None, - key_deserializer: Optional[DeserializerType] = "bytes", - value_serializer: Optional[SerializerType] = None, - key_serializer: Optional[SerializerType] = "bytes", - config: Optional[TopicConfig] = None, - timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic +class Deserializer(abc.ABC) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L47) + + + +#### Deserializer.\_\_init\_\_ + +```python +def __init__(column_name: Optional[str] = None, *args, **kwargs) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L48) + +A base class for all Deserializers + +**Arguments**: + +- `column_name`: if provided, the deserialized value will be wrapped into +dictionary with `column_name` as a key. + + + +#### Deserializer.split\_values + +```python +@property +def split_values() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L142) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L58) -A convenience method for generating a `Topic`. Will use default config options +Return True if the deserialized message should be considered as Iterable +and each item in it should be processed as a separate message. -as dictated by the TopicManager. + -**Arguments**: +### Serializer -- `name`: topic name -- `value_deserializer`: a deserializer type for values -- `key_deserializer`: a deserializer type for keys -- `value_serializer`: a serializer type for values -- `key_serializer`: a serializer type for keys -- `config`: optional topic configurations (for creation/validation) -- `timestamp_extractor`: a callable that returns a timestamp in -milliseconds from a deserialized message. 
+```python +class Serializer(abc.ABC) +``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L74) -Topic object with creation configs +A base class for all Serializers - + -#### TopicManager.changelog\_topic +#### Serializer.extra\_headers ```python -def changelog_topic(topic_name: str, store_name: str, - consumer_group: str) -> Topic +@property +def extra_headers() -> MessageHeadersMapping ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L191) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/base.py#L80) -Performs all the logic necessary to generate a changelog topic based on a +Informs producer to set additional headers -"source topic" (aka input/consumed topic). +for the message it will be serializing -Its main goal is to ensure partition counts of the to-be generated changelog -match the source topic, and ensure the changelog topic is compacted. Also -enforces the serialization type. All `Topic` objects generated with this are -stored on the TopicManager. +Must return a dictionary with headers. +Keys must be strings, and values must be strings, bytes or None. -If source topic already exists, defers to the existing topic settings, else -uses the settings as defined by the `Topic` (and its defaults) as generated -by the `TopicManager`. +**Returns**: -In general, users should NOT need this; an Application knows when/how to -generate changelog topics. To turn off changelogs, init an Application with -"use_changelog_topics"=`False`. +dict with headers -**Arguments**: + -- `consumer_group`: name of consumer group (for this app) -- `topic_name`: name of consumed topic (app input topic) -> NOTE: normally contain any prefixes added by TopicManager.topic() -- `store_name`: name of the store this changelog belongs to -(default, rolling10s, etc.) +## quixstreams.models.messages -**Returns**: + -`Topic` object (which is also stored on the TopicManager) +## quixstreams.models.rows - + -#### TopicManager.create\_topics +### Row ```python -def create_topics(topics: List[Topic]) +class Row() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L262) - -Creates topics via an explicit list of provided `Topics`. - -Exists as a way to manually specify what topics to create; otherwise, -`create_all_topics()` is generally simpler. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/rows.py#L11) -**Arguments**: - -- `topics`: list of `Topic`s +Row is a dict-like interface on top of the message data + some Kafka props - + -#### TopicManager.create\_all\_topics +#### Row.keys ```python -def create_all_topics() +def keys() -> KeysView ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L277) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/rows.py#L73) -A convenience method to create all Topic objects stored on this TopicManager. 
+Also allows unpacking row.value via **row - + -#### TopicManager.validate\_all\_topics +#### Row.clone ```python -def validate_all_topics() +def clone(value: dict) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L283) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/rows.py#L85) -Validates all topics exist and changelogs have correct topic and rep factor. +Manually clone the Row; doing it this way is much faster than doing a deepcopy +on the entire Row object. -Issues are pooled and raised as an Exception once inspections are complete. + + +## quixstreams.models.topics @@ -3068,7 +3207,7 @@ Issues are pooled and raised as an Exception once inspections are complete. def convert_topic_list(topics: List[Topic]) -> List[ConfluentTopic] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L23) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L23) Converts `Topic`s to `ConfluentTopic`s as required for Confluent's @@ -3090,7 +3229,7 @@ list of confluent_kafka `ConfluentTopic`s class TopicAdmin() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L46) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L46) For performing "admin"-level operations on a Kafka cluster, mostly around topics. @@ -3104,7 +3243,7 @@ Primarily used to create and inspect topic configurations. 
def __init__(broker_address: str, extra_config: Optional[Mapping] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L53) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L53) **Arguments**: @@ -3119,7 +3258,7 @@ def __init__(broker_address: str, extra_config: Optional[Mapping] = None) def list_topics() -> Dict[str, ConfluentTopicMetadata] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L74) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L74) Get a list of topics and their metadata from a Kafka cluster @@ -3135,7 +3274,7 @@ a dict of topic names and their metadata objects def inspect_topics(topic_names: List[str]) -> Dict[str, Optional[TopicConfig]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L83) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L83) A simplified way of getting the topic configurations of the provided topics @@ -3159,7 +3298,7 @@ def create_topics(topics: List[Topic], finalize_timeout: int = 60) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L156) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L156) Create the given list of topics and confirm they are ready. @@ -3172,10 +3311,6 @@ fail (it ignores issues for a topic already existing). - `timeout`: timeout of the creation broker request - `finalize_timeout`: the timeout of the topic finalizing ("ready") - - -## quixstreams.models.topics.exceptions - ## quixstreams.models.topics.topic @@ -3189,7 +3324,7 @@ fail (it ignores issues for a topic already existing). class TopicConfig() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L43) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L43) Represents all kafka-level configuration for a kafka topic. @@ -3203,7 +3338,7 @@ Generally used by Topic and any topic creation procedures. class Topic() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L84) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L84) A definition of a Kafka topic. @@ -3226,7 +3361,7 @@ def __init__( timestamp_extractor: Optional[TimestampExtractor] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L93) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L93) **Arguments**: @@ -3248,7 +3383,7 @@ milliseconds from a deserialized message. 
def name() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L122) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L122) Topic name @@ -3260,7 +3395,7 @@ Topic name def row_serialize(row: Row, key: Optional[Any] = None) -> KafkaMessage ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L132) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L132) Serialize Row to a Kafka message structure @@ -3282,7 +3417,7 @@ def row_deserialize( message: ConfluentKafkaMessageProto) -> Union[Row, List[Row], None] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L155) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L155) Deserialize incoming Kafka message to a Row. @@ -3294,1086 +3429,1016 @@ Deserialize incoming Kafka message to a Row. Row, list of Rows or None if the message is ignored. - - -## quixstreams.models.messagecontext - - - -### MessageContext - -```python -class MessageContext() -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/messagecontext.py#L7) - -An object with Kafka message properties. - -It is made pseudo-immutable (i.e. public attributes don't have setters), and -it should not be mutated during message processing. - - - -## quixstreams.models.messages - - - -## quixstreams.models.rows - - - -### Row - -```python -class Row() -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/rows.py#L11) - -Row is a dict-like interface on top of the message data + some Kafka props - - - -#### Row.keys - -```python -def keys() -> KeysView -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/rows.py#L73) - -Also allows unpacking row.value via **row - - - -#### Row.clone - -```python -def clone(value: dict) -> Self -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/rows.py#L85) + -Manually clone the Row; doing it this way is much faster than doing a deepcopy -on the entire Row object. 
+## quixstreams.models.topics.exceptions - + -## quixstreams.models.timestamps +## quixstreams.models.topics.manager - + -### TimestampType +#### affirm\_ready\_for\_create ```python -class TimestampType(enum.IntEnum) +def affirm_ready_for_create(topics: List[Topic]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/timestamps.py#L8) - - - -#### TIMESTAMP\_NOT\_AVAILABLE - -timestamps not supported by broker - - - -#### TIMESTAMP\_CREATE\_TIME - -message creation time (or source / producer time) - - - -#### TIMESTAMP\_LOG\_APPEND\_TIME - -broker receive time - - - -### MessageTimestamp - -```python -class MessageTimestamp() -``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L19) -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/timestamps.py#L14) +Validate a list of topics is ready for creation attempt -Represents a timestamp of incoming Kafka message. +**Arguments**: -It is made pseudo-immutable (i.e. public attributes don't have setters), and -it should not be mutated during message processing. +- `topics`: list of `Topic`s - + -#### MessageTimestamp.create +### TopicManager ```python -@classmethod -def create(cls, timestamp_type: int, milliseconds: int) -> Self +class TopicManager() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/timestamps.py#L41) - -Create a Timestamp object based on data - -from `confluent_kafka.Message.timestamp()`. - -If timestamp type is "TIMESTAMP_NOT_AVAILABLE", the milliseconds are set to None - -**Arguments**: - -- `timestamp_type`: a timestamp type represented as a number -Can be one of: -- "0" - TIMESTAMP_NOT_AVAILABLE, timestamps not supported by broker. -- "1" - TIMESTAMP_CREATE_TIME, message creation time (or source / producer time). -- "2" - TIMESTAMP_LOG_APPEND_TIME, broker receive time. -- `milliseconds`: the number of milliseconds since the epoch (UTC). - -**Returns**: - -Timestamp object - - - -## quixstreams.models.types - - - -### ConfluentKafkaMessageProto +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L29) -```python -class ConfluentKafkaMessageProto(Protocol) -``` +The source of all topic management with quixstreams. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/types.py#L12) +Generally initialized and managed automatically by an `Application`, +but allows a user to work with it directly when needed, such as using it alongside +a plain `Producer` to create its topics. -An interface of `confluent_kafka.Message`. +See methods for details. -Use it to not depend on exact implementation and simplify testing. + -Instances of `confluent_kafka.Message` cannot be directly created from Python, -see https://github.com/confluentinc/confluent-kafka-python/issues/1535. 
+#### TopicManager.\_\_init\_\_ - +```python +def __init__(topic_admin: TopicAdmin, create_timeout: int = 60) +``` -## quixstreams.platforms +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L48) - +**Arguments**: -## quixstreams.platforms.quix.checks +- `topic_admin`: an `Admin` instance (required for some functionality) +- `create_timeout`: timeout for topic creation - + -#### check\_state\_management\_enabled +#### TopicManager.changelog\_topics ```python -def check_state_management_enabled() +@property +def changelog_topics() -> Dict[str, Dict[str, Topic]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/checks.py#L11) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L71) -Check if State Management feature is enabled for the current deployment on -Quix platform. -If it's disabled, the exception will be raised. +Note: `Topic`s are the changelogs. - +returns: the changelog topic dict, {topic_name: {suffix: Topic}} -#### check\_state\_dir + + +#### TopicManager.topic\_config ```python -def check_state_dir(state_dir: str) +def topic_config(num_partitions: Optional[int] = None, + replication_factor: Optional[int] = None, + extra_config: Optional[dict] = None) -> TopicConfig ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/checks.py#L28) - -Check if Application "state_dir" matches the state dir on Quix platform. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L119) -If it doesn't match, the warning will be logged. +Convenience method for generating a `TopicConfig` with default settings **Arguments**: -- `state_dir`: application state_dir path +- `num_partitions`: the number of topic partitions +- `replication_factor`: the topic replication factor +- `extra_config`: other optional configuration settings - +**Returns**: -## quixstreams.platforms.quix.config +a TopicConfig object - + -### TopicCreationConfigs +#### TopicManager.topic ```python -@dataclasses.dataclass -class TopicCreationConfigs() +def topic(name: str, + value_deserializer: Optional[DeserializerType] = None, + key_deserializer: Optional[DeserializerType] = "bytes", + value_serializer: Optional[SerializerType] = None, + key_serializer: Optional[SerializerType] = "bytes", + config: Optional[TopicConfig] = None, + timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L51) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L140) - +A convenience method for generating a `Topic`. Will use default config options -#### name +as dictated by the TopicManager. -Required when not created by a Quix App. 
+**Arguments**: - +- `name`: topic name +- `value_deserializer`: a deserializer type for values +- `key_deserializer`: a deserializer type for keys +- `value_serializer`: a serializer type for values +- `key_serializer`: a serializer type for keys +- `config`: optional topic configurations (for creation/validation) +- `timestamp_extractor`: a callable that returns a timestamp in +milliseconds from a deserialized message. -#### strip\_workspace\_id\_prefix +**Returns**: + +Topic object with creation configs + + + +#### TopicManager.changelog\_topic ```python -def strip_workspace_id_prefix(workspace_id: str, s: str) -> str +def changelog_topic(topic_name: str, store_name: str, + consumer_group: str) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L60) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L189) -Remove the workspace ID from a given string if it starts with it, +Performs all the logic necessary to generate a changelog topic based on a -typically a topic or consumer group id +"source topic" (aka input/consumed topic). + +Its main goal is to ensure partition counts of the to-be generated changelog +match the source topic, and ensure the changelog topic is compacted. Also +enforces the serialization type. All `Topic` objects generated with this are +stored on the TopicManager. + +If source topic already exists, defers to the existing topic settings, else +uses the settings as defined by the `Topic` (and its defaults) as generated +by the `TopicManager`. + +In general, users should NOT need this; an Application knows when/how to +generate changelog topics. To turn off changelogs, init an Application with +"use_changelog_topics"=`False`. **Arguments**: -- `workspace_id`: the workspace id -- `s`: the string to append to +- `consumer_group`: name of consumer group (for this app) +- `topic_name`: name of consumed topic (app input topic) +> NOTE: normally contain any prefixes added by TopicManager.topic() +- `store_name`: name of the store this changelog belongs to +(default, rolling10s, etc.) **Returns**: -the string with workspace_id prefix removed +`Topic` object (which is also stored on the TopicManager) - + -#### prepend\_workspace\_id +#### TopicManager.create\_topics ```python -def prepend_workspace_id(workspace_id: str, s: str) -> str +def create_topics(topics: List[Topic]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L72) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L260) -Add the workspace ID as a prefix to a given string if it does not have it, +Creates topics via an explicit list of provided `Topics`. -typically a topic or consumer group it +Exists as a way to manually specify what topics to create; otherwise, +`create_all_topics()` is generally simpler. 
**Arguments**: -- `workspace_id`: the workspace id -- `s`: the string to append to +- `topics`: list of `Topic`s -**Returns**: + -the string with workspace_id prepended +#### TopicManager.create\_all\_topics - +```python +def create_all_topics() +``` -### QuixKafkaConfigsBuilder +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L275) + +A convenience method to create all Topic objects stored on this TopicManager. + + + +#### TopicManager.validate\_all\_topics ```python -class QuixKafkaConfigsBuilder() +def validate_all_topics() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L84) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L281) -Retrieves all the necessary information from the Quix API and builds all the -objects required to connect a confluent-kafka client to the Quix Platform. +Validates all topics exist and changelogs have correct topic and rep factor. -If not executed within the Quix platform directly, you must provide a Quix -"streaming" (aka "sdk") token, or Personal Access Token. +Issues are pooled and raised as an Exception once inspections are complete. -Ideally you also know your workspace name or id. If not, you can search for it -using a known topic name, but note the search space is limited to the access level -of your token. + -It also currently handles the app_auto_create_topics setting for Application.Quix. +## quixstreams.state.rocksdb.windowed.store - + -#### QuixKafkaConfigsBuilder.\_\_init\_\_ +### WindowedRocksDBStore ```python -def __init__(quix_sdk_token: Optional[str] = None, - workspace_id: Optional[str] = None, - workspace_cert_path: Optional[str] = None, - quix_portal_api_service: Optional[QuixPortalApiService] = None) +class WindowedRocksDBStore(RocksDBStore) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L100) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/store.py#L10) -**Arguments**: +RocksDB-based windowed state store. -- `quix_portal_api_service`: A QuixPortalApiService instance (else generated) -- `workspace_id`: A valid Quix Workspace ID (else searched for) -- `workspace_cert_path`: path to an existing workspace cert (else retrieved) +It keeps track of individual store partitions and provides access to the +partitions' transactions. 
- + -#### QuixKafkaConfigsBuilder.strip\_workspace\_id\_prefix +#### WindowedRocksDBStore.\_\_init\_\_ ```python -def strip_workspace_id_prefix(s: str) -> str +def __init__( + name: str, + topic: str, + base_dir: str, + changelog_producer_factory: Optional[ChangelogProducerFactory] = None, + options: Optional[RocksDBOptionsType] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L184) - -Remove the workspace ID from a given string if it starts with it, - -typically a topic or consumer group id +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/store.py#L18) **Arguments**: -- `s`: the string to append to +- `name`: a unique store name +- `topic`: a topic name for this store +- `base_dir`: path to a directory with the state +- `changelog_producer_factory`: a ChangelogProducerFactory instance +if using changelogs +- `options`: RocksDB options. If `None`, the default options will be used. -**Returns**: + -the string with workspace_id prefix removed +## quixstreams.state.rocksdb.windowed.partition - + -#### QuixKafkaConfigsBuilder.prepend\_workspace\_id +### WindowedRocksDBStorePartition ```python -def prepend_workspace_id(s: str) -> str +class WindowedRocksDBStorePartition(RocksDBStorePartition) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L194) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/partition.py#L24) -Add the workspace ID as a prefix to a given string if it does not have it, +A base class to access windowed state in RocksDB. -typically a topic or consumer group it +It represents a single RocksDB database. + +Besides the data, it keeps track of the latest observed timestamp and +stores the expiration index to delete expired windows. **Arguments**: -- `s`: the string to append to +- `path`: an absolute path to the RocksDB folder +- `options`: RocksDB options. If `None`, the default options will be used. -**Returns**: + -the string with workspace_id prepended +## quixstreams.state.rocksdb.windowed.metadata - + -#### QuixKafkaConfigsBuilder.search\_for\_workspace +## quixstreams.state.rocksdb.windowed.transaction + + + +### WindowedRocksDBPartitionTransaction ```python -def search_for_workspace( - workspace_name_or_id: Optional[str] = None) -> Optional[dict] +class WindowedRocksDBPartitionTransaction(RocksDBPartitionTransaction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L204) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/transaction.py#L22) -Search for a workspace given an expected workspace name or id. 
+ -**Arguments**: +#### WindowedRocksDBPartitionTransaction.expire\_windows -- `workspace_name_or_id`: the expected name or id of a workspace +```python +def expire_windows(duration_ms: int, + prefix: bytes, + grace_ms: int = 0) -> List[Tuple[Tuple[int, int], Any]] +``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/transaction.py#L105) -the workspace data dict if search success, else None +Get a list of expired windows from RocksDB considering latest timestamp, - +window size and grace period. +It marks the latest found window as expired in the expiration index, so +calling this method multiple times will yield different results for the same +"latest timestamp". -#### QuixKafkaConfigsBuilder.get\_workspace\_info +How it works: +- First, it looks for the start time of the last expired window for the current + prefix using expiration cache. If it's found, it will be used to reduce + the search space and to avoid returning already expired windows. +- Then it goes over window segments and fetches the windows + that should be expired. +- At last, it updates the expiration cache with the start time of the latest + found windows -```python -def get_workspace_info(known_workspace_topic: Optional[str] = None) -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L227) +sorted list of tuples in format `((start, end), value)` -Queries for workspace data from the Quix API, regardless of instance cache, + -and updates instance attributes from query result. +## quixstreams.state.rocksdb.windowed -**Arguments**: + -- `known_workspace_topic`: a topic you know to exist in some workspace +## quixstreams.state.rocksdb.windowed.serialization - + -#### QuixKafkaConfigsBuilder.search\_workspace\_for\_topic +#### parse\_window\_key ```python -def search_workspace_for_topic(workspace_id: str, topic: str) -> Optional[str] +def parse_window_key(key: bytes) -> Tuple[bytes, int, int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L254) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/serialization.py#L12) -Search through all the topics in the given workspace id to see if there is a +Parse the window key from Rocksdb into (message_key, start, end) structure. -match with the provided topic. +Expected window key format: +|| **Arguments**: -- `workspace_id`: the workspace to search in -- `topic`: the topic to search for +- `key`: a key from Rocksdb **Returns**: -the workspace_id if success, else None +a tuple with message key, start timestamp, end timestamp - + -#### QuixKafkaConfigsBuilder.search\_for\_topic\_workspace +#### encode\_window\_key ```python -def search_for_topic_workspace(topic: str) -> Optional[dict] +def encode_window_key(start_ms: int, end_ms: int) -> bytes ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L270) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/serialization.py#L39) -Find what workspace a topic belongs to. 
+Encode window start and end timestamps into bytes of the following format: -If there is only one workspace altogether, it is assumed to be the workspace. -More than one means each workspace will be searched until the first hit. +```|``` + +Encoding window keys this way make them sortable in RocksDB within the same prefix. **Arguments**: -- `topic`: the topic to search for +- `start_ms`: window start in milliseconds +- `end_ms`: window end in milliseconds **Returns**: -workspace data dict if topic search success, else None +window timestamps as bytes - + -#### QuixKafkaConfigsBuilder.get\_workspace\_ssl\_cert +#### encode\_window\_prefix ```python -def get_workspace_ssl_cert( - extract_to_folder: Optional[Path] = None) -> Optional[str] +def encode_window_prefix(prefix: bytes, start_ms: int) -> bytes ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L291) - -Gets and extracts zipped certificate from the API to provided folder if the +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/serialization.py#L53) -SSL certificate is specified in broker configuration. +Encode window prefix and start time to iterate over keys in RocksDB -If no path was provided, will dump to /tmp. Expects cert named 'ca.cert'. +Format: +```|``` **Arguments**: -- `extract_to_folder`: path to folder to dump zipped cert file to +- `prefix`: transaction prefix +- `start_ms`: window start time in milliseconds **Returns**: -full cert filepath as string or `None` if certificate is not specified - - +bytes -#### QuixKafkaConfigsBuilder.create\_topics + -```python -def create_topics(topics: List[Topic], - finalize_timeout_seconds: Optional[int] = None) -``` +## quixstreams.state.rocksdb.windowed.state -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L368) + -Create topics in a Quix cluster. +### WindowedTransactionState -**Arguments**: +```python +class WindowedTransactionState(WindowedState) +``` -- `topics`: a list of `Topic` objects -- `finalize_timeout_seconds`: How long to wait for the topics to be -marked as "Ready" (and thus ready to produce to/consume from). +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L9) - + -#### QuixKafkaConfigsBuilder.confirm\_topics\_exist +#### WindowedTransactionState.\_\_init\_\_ ```python -def confirm_topics_exist(topics: Union[List[Topic], List[str]]) +def __init__(transaction: "WindowedRocksDBPartitionTransaction", + prefix: bytes) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L417) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L12) -Confirm whether the desired set of topics exists in the Quix workspace. +A windowed state to be provided into `StreamingDataFrame` window functions. 
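
The three serialization helpers documented above can be combined into a round trip — a minimal sketch, not part of this patch; the `b"|"` separator joining the message key and the window part is an assumption:

```python
# Sketch only: function names and signatures come from the docs above;
# the b"|" separator used to join prefix and window key is an assumption.
from quixstreams.state.rocksdb.windowed.serialization import (
    encode_window_key,
    encode_window_prefix,
    parse_window_key,
)

window_part = encode_window_key(start_ms=1_000, end_ms=2_000)        # sortable suffix
scan_prefix = encode_window_prefix(prefix=b"message-key", start_ms=1_000)
message_key, start_ms, end_ms = parse_window_key(b"message-key|" + window_part)
```
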
**Arguments**: -- `topics`: a list of `Topic` or topic names +- `transaction`: instance of `WindowedRocksDBPartitionTransaction` - + -#### QuixKafkaConfigsBuilder.get\_confluent\_broker\_config +#### WindowedTransactionState.get\_window ```python -def get_confluent_broker_config(known_topic: Optional[str] = None) -> dict +def get_window(start_ms: int, + end_ms: int, + default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L457) - -Get the full client config dictionary required to authenticate a confluent-kafka +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L23) -client to a Quix platform broker/workspace. +Get the value of the window defined by `start` and `end` timestamps -The returned config can be used directly by any confluent-kafka-python consumer/ -producer (add your producer/consumer-specific configs afterward). +if the window is present in the state, else default **Arguments**: -- `known_topic`: a topic known to exist in some workspace +- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `default`: default value to return if the key is not found **Returns**: -a dict of confluent-kafka-python client settings (see librdkafka -config for more details) +value or None if the key is not found and `default` is not provided - + -#### QuixKafkaConfigsBuilder.get\_confluent\_client\_configs +#### WindowedTransactionState.update\_window ```python -def get_confluent_client_configs( - topics: list, - consumer_group_id: Optional[str] = None -) -> Tuple[dict, List[str], Optional[str]] +def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/config.py#L502) - -Get all the values you need in order to use a confluent_kafka-based client - -with a topic on a Quix platform broker/workspace. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L39) -The returned config can be used directly by any confluent-kafka-python consumer/ -producer (add your producer/consumer-specific configs afterward). +Set a value for the window. -The topics and consumer group are appended with any necessary values. +This method will also update the latest observed timestamp in state partition +using the provided `timestamp`. **Arguments**: -- `topics`: list of topics -- `consumer_group_id`: consumer group id, if needed - -**Returns**: +- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `value`: value of the window +- `timestamp_ms`: current message timestamp in milliseconds -a tuple with configs and altered versions of the topics -and consumer group name + - +#### WindowedTransactionState.get\_latest\_timestamp -## quixstreams.platforms.quix.env +```python +def get_latest_timestamp() -> int +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L60) -### QuixEnvironment +Get the latest observed timestamp for the current state partition. 
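
For context, a minimal sketch (not part of this patch) of how `get_window()` and `update_window()` documented above are typically used; `state` is assumed to be the `WindowedTransactionState` handed to a windowed aggregation:

```python
# Sketch only: `state` is assumed to be a WindowedTransactionState instance
# provided by the framework to a window aggregation callback.
start_ms, end_ms = 60_000, 120_000

count = state.get_window(start_ms, end_ms, default=0)
state.update_window(start_ms, end_ms, value=count + 1, timestamp_ms=65_000)
```
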
-```python -class QuixEnvironment() -``` +Use this timestamp to determine if the arriving event is late and should be +discarded from the processing. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L7) +**Returns**: -Class to access various Quix platform environment settings +latest observed event timestamp in milliseconds - + -#### QuixEnvironment.state\_management\_enabled +#### WindowedTransactionState.expire\_windows ```python -@property -def state_management_enabled() -> bool +def expire_windows(duration_ms: int, + grace_ms: int = 0) -> List[Tuple[Tuple[int, int], Any]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/windowed/state.py#L72) -Check whether "State management" is enabled for the current deployment +Get a list of expired windows from RocksDB considering the current +latest timestamp, window duration and grace period. -**Returns**: +It also marks the latest found window as expired in the expiration index, so +calling this method multiple times will yield different results for the same +"latest timestamp". -True if state management is enabled, otherwise False + - +## quixstreams.state.rocksdb.options -#### QuixEnvironment.deployment\_id + + +### RocksDBOptions ```python -@property -def deployment_id() -> Optional[str] +@dataclasses.dataclass(frozen=True) +class RocksDBOptions(RocksDBOptionsType) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L27) - -Return current Quix deployment id. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/options.py#L25) -This variable is meant to be set only by Quix Platform and only -when the application is deployed. +RocksDB database options. -**Returns**: +**Arguments**: -deployment id or None +- `dumps`: function to dump data to JSON +- `loads`: function to load data from JSON +- `open_max_retries`: number of times to retry opening the database +if it's locked by another process. To disable retrying, pass 0 +- `open_retry_backoff`: number of seconds to wait between each retry. +Please see `rocksdict.Options` for a complete description of other options. 
- + -#### QuixEnvironment.workspace\_id +#### RocksDBOptions.to\_options ```python -@property -def workspace_id() -> Optional[str] +def to_options() -> rocksdict.Options ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L39) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/options.py#L53) -Return Quix workspace id if set +Convert parameters to `rocksdict.Options` **Returns**: -workspace id or None +instance of `rocksdict.Options` - + -#### QuixEnvironment.portal\_api +## quixstreams.state.rocksdb.store + + + +### RocksDBStore ```python -@property -def portal_api() -> Optional[str] +class RocksDBStore(Store) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L47) - -Return Quix Portal API url if set +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L19) -**Returns**: +RocksDB-based state store. -portal API URL or None +It keeps track of individual store partitions and provides access to the +partitions' transactions. - + -#### QuixEnvironment.state\_dir +#### RocksDBStore.\_\_init\_\_ ```python -@property -def state_dir() -> str +def __init__( + name: str, + topic: str, + base_dir: str, + changelog_producer_factory: Optional[ChangelogProducerFactory] = None, + options: Optional[options_type] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/env.py#L56) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L29) -Return application state directory on Quix. +**Arguments**: -**Returns**: +- `name`: a unique store name +- `topic`: a topic name for this store +- `base_dir`: path to a directory with the state +- `changelog_producer_factory`: a ChangelogProducerFactory instance +if using changelogs +- `options`: RocksDB options. If `None`, the default options will be used. -path to state dir + + +#### RocksDBStore.topic + +```python +@property +def topic() -> str +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L53) -## quixstreams.platforms.quix.topic\_manager +Store topic name - + -### QuixTopicManager +#### RocksDBStore.name ```python -class QuixTopicManager(TopicManager) +@property +def name() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/topic_manager.py#L9) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L60) -The source of all topic management with quixstreams. +Store name -This is specifically for Applications using the Quix platform. + -Generally initialized and managed automatically by an `Application.Quix`, -but allows a user to work with it directly when needed, such as using it alongside -a plain `Producer` to create its topics. +#### RocksDBStore.partitions -See methods for details. 
+```python +@property +def partitions() -> Dict[int, RocksDBStorePartition] +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L67) -#### QuixTopicManager.\_\_init\_\_ +Mapping of assigned store partitions + + + +#### RocksDBStore.assign\_partition ```python -def __init__(topic_admin: TopicAdmin, - quix_config_builder: QuixKafkaConfigsBuilder, - create_timeout: int = 60) +def assign_partition(partition: int) -> RocksDBStorePartition ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/topic_manager.py#L30) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L80) -**Arguments**: +Open and assign store partition. -- `topic_admin`: an `Admin` instance -- `create_timeout`: timeout for topic creation -- `quix_config_builder`: A QuixKafkaConfigsBuilder instance, else one is -generated for you. +If the partition is already assigned, it will not re-open it and return +the existing partition instead. - +**Arguments**: -## quixstreams.platforms.quix +- `partition`: partition number - +**Returns**: -## quixstreams.platforms.quix.api +instance of`RocksDBStorePartition` - + -### QuixPortalApiService +#### RocksDBStore.revoke\_partition ```python -class QuixPortalApiService() +def revoke_partition(partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/api.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L117) -A light wrapper around the Quix Portal Api. If used in the Quix Platform, it will -use that workspaces auth token and portal endpoint, else you must provide it. +Revoke and close the assigned store partition. -Function names closely reflect the respective API endpoint, -each starting with the method [GET, POST, etc.] followed by the endpoint path. +If the partition is not assigned, it will log the message and return. -Results will be returned in the form of request's Response.json(), unless something -else is required. Non-200's will raise exceptions. +**Arguments**: -See the swagger documentation for more info about the endpoints. +- `partition`: partition number - + -#### QuixPortalApiService.get\_workspace\_certificate +#### RocksDBStore.start\_partition\_transaction ```python -def get_workspace_certificate( - workspace_id: Optional[str] = None) -> Optional[bytes] +def start_partition_transaction(partition: int) -> RocksDBPartitionTransaction ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/platforms/quix/api.py#L114) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L138) -Get a workspace TLS certificate if available. +Start a new partition transaction. -Returns `None` if certificate is not specified. +`RocksDBPartitionTransaction` is the primary interface for working with data in +the underlying RocksDB. 
**Arguments**: -- `workspace_id`: workspace id, optional +- `partition`: partition number **Returns**: -certificate as bytes if present, or None - - +instance of `RocksDBPartitionTransaction` -## quixstreams.platforms.quix.exceptions + - +#### RocksDBStore.close -## quixstreams.state.rocksdb.serialization +```python +def close() +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/store.py#L160) -## quixstreams.state.rocksdb.windowed +Close the store and revoke all assigned partitions - + -## quixstreams.state.rocksdb.windowed.serialization +## quixstreams.state.rocksdb.partition - + -#### parse\_window\_key +### RocksDBStorePartition ```python -def parse_window_key(key: bytes) -> Tuple[bytes, int, int] +class RocksDBStorePartition(StorePartition) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/serialization.py#L12) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L40) -Parse the window key from Rocksdb into (message_key, start, end) structure. +A base class to access state in RocksDB. -Expected window key format: -|| +It represents a single RocksDB database. -**Arguments**: +Responsibilities: + 1. Managing access to the RocksDB instance + 2. Creating transactions to interact with data + 3. Flushing WriteBatches to the RocksDB -- `key`: a key from Rocksdb +It opens the RocksDB on `__init__`. If the db is locked by another process, +it will retry according to `open_max_retries` and `open_retry_backoff` options. -**Returns**: +**Arguments**: -a tuple with message key, start timestamp, end timestamp +- `path`: an absolute path to the RocksDB folder +- `options`: RocksDB options. If `None`, the default options will be used. - + -#### encode\_window\_key +#### RocksDBStorePartition.begin ```python -def encode_window_key(start_ms: int, end_ms: int) -> bytes +def begin() -> RocksDBPartitionTransaction ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/serialization.py#L39) - -Encode window start and end timestamps into bytes of the following format: - -```|``` - -Encoding window keys this way make them sortable in RocksDB within the same prefix. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L75) -**Arguments**: +Create a new `RocksDBTransaction` object. -- `start_ms`: window start in milliseconds -- `end_ms`: window end in milliseconds +Using `RocksDBTransaction` is a recommended way for accessing the data. 
**Returns**: -window timestamps as bytes +an instance of `RocksDBTransaction` - + -#### encode\_window\_prefix +#### RocksDBStorePartition.recover\_from\_changelog\_message ```python -def encode_window_prefix(prefix: bytes, start_ms: int) -> bytes +def recover_from_changelog_message( + changelog_message: ConfluentKafkaMessageProto, committed_offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/serialization.py#L53) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L128) -Encode window prefix and start time to iterate over keys in RocksDB +Updates state from a given changelog message. -Format: -```|``` +The actual update may be skipped when both conditions are met: + +- The changelog message has headers with the processed message offset. +- This processed offset is larger than the latest committed offset for the same + topic partition. + +This way the state does not apply the state changes for not-yet-committed +messages and improves the state consistency guarantees. **Arguments**: -- `prefix`: transaction prefix -- `start_ms`: window start time in milliseconds +- `changelog_message`: A raw Confluent message read from a changelog topic. +- `committed_offset`: latest committed offset for the partition -**Returns**: + -bytes +#### RocksDBStorePartition.set\_changelog\_offset - +```python +def set_changelog_offset(changelog_offset: int) +``` -## quixstreams.state.rocksdb.windowed.state +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L169) - +Set the changelog offset based on a message (usually an "offset-only" message). -### WindowedTransactionState +Used during recovery. -```python -class WindowedTransactionState(WindowedState) -``` +**Arguments**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L9) +- `changelog_offset`: A changelog offset - + -#### WindowedTransactionState.\_\_init\_\_ +#### RocksDBStorePartition.write ```python -def __init__(transaction: "WindowedRocksDBPartitionTransaction") +def write(batch: WriteBatch) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L12) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L179) -A windowed state to be provided into `StreamingDataFrame` window functions. 
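
A minimal sketch (not part of this patch) of the byte-level `RocksDBStorePartition` API shown in this hunk (`write()`, `get()`, `exists()`); `partition` is assumed to be an open `RocksDBStorePartition`:

```python
# Sketch only: `partition` is assumed to be an open RocksDBStorePartition.
from rocksdict import WriteBatch

batch = WriteBatch(raw_mode=True)                           # raw bytes mode
cf_handle = partition.get_column_family_handle("default")
batch.put(b"some-key", b"some-value", cf_handle)
partition.write(batch)                                      # one atomic write

assert partition.exists(b"some-key")
assert partition.get(b"some-key") == b"some-value"
```
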
+Write `WriteBatch` to RocksDB **Arguments**: -- `transaction`: instance of `WindowedRocksDBPartitionTransaction` +- `batch`: an instance of `rocksdict.WriteBatch` - + -#### WindowedTransactionState.get\_window +#### RocksDBStorePartition.get ```python -def get_window(start_ms: int, - end_ms: int, - default: Any = None) -> Optional[Any] +def get(key: bytes, + default: Any = None, + cf_name: str = "default") -> Union[None, bytes, Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L20) - -Get the value of the window defined by `start` and `end` timestamps +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L186) -if the window is present in the state, else default +Get a key from RocksDB. **Arguments**: -- `start_ms`: start of the window in milliseconds -- `end_ms`: end of the window in milliseconds -- `default`: default value to return if the key is not found +- `key`: a key encoded to `bytes` +- `default`: a default value to return if the key is not found. +- `cf_name`: rocksdb column family name. Default - "default" **Returns**: -value or None if the key is not found and `default` is not provided +a value if the key is present in the DB. Otherwise, `default` - + -#### WindowedTransactionState.update\_window +#### RocksDBStorePartition.exists ```python -def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int) +def exists(key: bytes, cf_name: str = "default") -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L36) - -Set a value for the window. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L200) -This method will also update the latest observed timestamp in state partition -using the provided `timestamp`. +Check if a key is present in the DB. **Arguments**: -- `start_ms`: start of the window in milliseconds -- `end_ms`: end of the window in milliseconds -- `value`: value of the window -- `timestamp_ms`: current message timestamp in milliseconds - - - -#### WindowedTransactionState.get\_latest\_timestamp - -```python -def get_latest_timestamp() -> int -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L53) - -Get the latest observed timestamp for the current state partition. - -Use this timestamp to determine if the arriving event is late and should be -discarded from the processing. +- `key`: a key encoded to `bytes`. +- `cf_name`: rocksdb column family name. Default - "default" **Returns**: -latest observed event timestamp in milliseconds +`True` if the key is present, `False` otherwise. - + -#### WindowedTransactionState.expire\_windows +#### RocksDBStorePartition.get\_processed\_offset ```python -def expire_windows(duration_ms: int, - grace_ms: int = 0) -> List[Tuple[Tuple[int, int], Any]] -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/state.py#L65) - -Get a list of expired windows from RocksDB considering the current -latest timestamp, window duration and grace period. 
- -It also marks the latest found window as expired in the expiration index, so -calling this method multiple times will yield different results for the same -"latest timestamp". +def get_processed_offset() -> Optional[int] +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L211) -## quixstreams.state.rocksdb.windowed.metadata +Get last processed offset for the given partition - +**Returns**: -## quixstreams.state.rocksdb.windowed.partition +offset or `None` if there's no processed offset yet - + -### WindowedRocksDBStorePartition +#### RocksDBStorePartition.get\_changelog\_offset ```python -class WindowedRocksDBStorePartition(RocksDBStorePartition) +def get_changelog_offset() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/partition.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L223) -A base class to access windowed state in RocksDB. +Get offset that the changelog is up-to-date with. -It represents a single RocksDB database. +**Returns**: -Besides the data, it keeps track of the latest observed timestamp and -stores the expiration index to delete expired windows. +offset or `None` if there's no processed offset yet -**Arguments**: + -- `path`: an absolute path to the RocksDB folder -- `options`: RocksDB options. If `None`, the default options will be used. +#### RocksDBStorePartition.close - +```python +def close() +``` -## quixstreams.state.rocksdb.windowed.store +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L233) - +Close the underlying RocksDB -### WindowedRocksDBStore + + +#### RocksDBStorePartition.path ```python -class WindowedRocksDBStore(RocksDBStore) +@property +def path() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/store.py#L10) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L246) -RocksDB-based windowed state store. +Absolute path to RocksDB database folder -It keeps track of individual store partitions and provides access to the -partitions' transactions. +**Returns**: - +file path -#### WindowedRocksDBStore.\_\_init\_\_ + + +#### RocksDBStorePartition.destroy ```python -def __init__( - name: str, - topic: str, - base_dir: str, - changelog_producer_factory: Optional[ChangelogProducerFactory] = None, - options: Optional[RocksDBOptionsType] = None) +@classmethod +def destroy(cls, path: str) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/store.py#L18) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L254) -**Arguments**: +Delete underlying RocksDB database -- `name`: a unique store name -- `topic`: a topic name for this store -- `base_dir`: path to a directory with the state -- `changelog_producer_factory`: a ChangelogProducerFactory instance -if using changelogs -- `options`: RocksDB options. If `None`, the default options will be used. +The database must be closed first. 
- +**Arguments**: -## quixstreams.state.rocksdb.windowed.transaction +- `path`: an absolute path to the RocksDB folder - + -### WindowedRocksDBPartitionTransaction +#### RocksDBStorePartition.get\_column\_family\_handle ```python -class WindowedRocksDBPartitionTransaction(RocksDBPartitionTransaction) +def get_column_family_handle(cf_name: str) -> ColumnFamily ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/transaction.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L264) - +Get a column family handle to pass to it WriteBatch. -#### WindowedRocksDBPartitionTransaction.expire\_windows +This method will cache the CF handle instance to avoid creating them +repeatedly. + +**Arguments**: + +- `cf_name`: column family name + +**Returns**: + +instance of `rocksdict.ColumnFamily` + + + +#### RocksDBStorePartition.get\_column\_family ```python -def expire_windows(duration_ms: int, - grace_ms: int = 0) -> List[Tuple[Tuple[int, int], Any]] +def get_column_family(cf_name: str) -> Rdict ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/windowed/transaction.py#L79) - -Get a list of expired windows from RocksDB considering latest timestamp, +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/partition.py#L285) -window size and grace period. -It marks the latest found window as expired in the expiration index, so -calling this method multiple times will yield different results for the same -"latest timestamp". +Get a column family instance. -How it works: -- First, it looks for the start time of the last expired window for the current - prefix using expiration cache. If it's found, it will be used to reduce - the search space and to avoid returning already expired windows. -- Then it goes over window segments and fetches the windows - that should be expired. -- At last, it updates the expiration cache with the start time of the latest - found windows +This method will cache the CF instance to avoid creating them repeatedly. -**Returns**: +**Arguments**: -sorted list of tuples in format `((start, end), value)` +- `cf_name`: column family name - +**Returns**: -## quixstreams.state.rocksdb +instance of `rocksdict.Rdict` for the given column family @@ -4391,7 +4456,7 @@ sorted list of tuples in format `((start, end), value)` class RocksDBPartitionTransaction(PartitionTransaction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L71) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L61) A transaction class to perform simple key-value operations like "get", "set", "delete" and "exists" on a single RocksDB partition. @@ -4402,10 +4467,9 @@ Serialization Prefixing ********* -`RocksDBTransaction` allows to set prefixes for the keys in the given code block -using :meth:`with_prefix()` context manager. -Normally, `StreamingDataFrame` class will use message keys as prefixes -in order to namespace the stored keys across different messages. +Methods `get()`, `set()`, `delete()` and `exists()` methods require prefixes for +the keys. 
+Normally, the Kafka message keys are supposed to be used as prefixes. Transactional properties ************************ @@ -4415,7 +4479,7 @@ in a single batch, flush them atomically, and allow the updates be visible within the transaction before it's flushed (aka "read-your-own-writes" problem). If any mutation fails during the transaction -(e.g. we failed to write the updates to the RocksDB), the whole transaction +(e.g., failed to write the updates to the RocksDB), the whole transaction will be marked as failed and cannot be used anymore. In this case, a new `RocksDBTransaction` should be created. @@ -4426,11 +4490,13 @@ In this case, a new `RocksDBTransaction` should be created. #### RocksDBPartitionTransaction.\_\_init\_\_ ```python -def __init__(partition: "RocksDBStorePartition", dumps: DumpsFunc, - loads: LoadsFunc) +def __init__(partition: "RocksDBStorePartition", + dumps: DumpsFunc, + loads: LoadsFunc, + changelog_producer: Optional[ChangelogProducer] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L114) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L100) **Arguments**: @@ -4439,46 +4505,19 @@ the underlying RocksDB - `dumps`: a function to serialize data to bytes. - `loads`: a function to deserialize data from bytes. - - -#### RocksDBPartitionTransaction.with\_prefix - -```python -@contextlib.contextmanager -def with_prefix(prefix: Any = b"") -> Self -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L141) - -A context manager set the prefix for all keys in the scope. - -Normally, it's called by Streaming DataFrames engine to ensure that every -message key is stored separately. - -The `with_prefix` calls should not be nested. -Only one prefix can be set at a time. - -**Arguments**: - -- `prefix`: a prefix string to be used. -Should be either `bytes` or object serializable to `bytes` -by `dumps` function. -The prefix doesn't need to contain the separator, it will be added -automatically between the key and the prefix if the prefix -is not empty. - #### RocksDBPartitionTransaction.get ```python -@_validate_transaction_state +@_validate_transaction_status(PartitionTransactionStatus.STARTED) def get(key: Any, + prefix: bytes, default: Any = None, cf_name: str = "default") -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L170) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L124) Get a key from the store. @@ -4490,6 +4529,7 @@ It returns `None` if the key is not found and `default` is not provided. **Arguments**: - `key`: a key to get from DB +- `prefix`: a key prefix - `default`: value to return if the key is not present in the state. It can be of any type. - `cf_name`: rocksdb column family name. 
Default - "default" @@ -4503,11 +4543,11 @@ value or `default` #### RocksDBPartitionTransaction.set ```python -@_validate_transaction_state -def set(key: Any, value: Any, cf_name: str = "default") +@_validate_transaction_status(PartitionTransactionStatus.STARTED) +def set(key: Any, value: Any, prefix: bytes, cf_name: str = "default") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L205) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L164) Set a key to the store. @@ -4516,6 +4556,7 @@ It first updates the key in the update cache. **Arguments**: - `key`: key to store in DB +- `prefix`: a key prefix - `value`: value to store in DB - `cf_name`: rocksdb column family name. Default - "default" @@ -4524,11 +4565,11 @@ It first updates the key in the update cache. #### RocksDBPartitionTransaction.delete ```python -@_validate_transaction_state -def delete(key: Any, cf_name: str = "default") +@_validate_transaction_status(PartitionTransactionStatus.STARTED) +def delete(key: Any, prefix: bytes, cf_name: str = "default") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L230) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L187) Delete a key from the store. @@ -4536,7 +4577,8 @@ It first deletes the key from the update cache. **Arguments**: -- `key`: key to delete from DB +- `key`: a key to delete from DB +- `prefix`: a key prefix - `cf_name`: rocksdb column family name. Default - "default" @@ -4544,11 +4586,11 @@ It first deletes the key from the update cache. #### RocksDBPartitionTransaction.exists ```python -@_validate_transaction_state -def exists(key: Any, cf_name: str = "default") -> bool +@_validate_transaction_status(PartitionTransactionStatus.STARTED) +def exists(key: Any, prefix: bytes, cf_name: str = "default") -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L253) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L208) Check if a key exists in the store. @@ -4557,12 +4599,70 @@ It first looks up the key in the update cache. **Arguments**: - `key`: a key to check in DB +- `prefix`: a key prefix - `cf_name`: rocksdb column family name. Default - "default" **Returns**: `True` if the key exists, `False` otherwise. + + +#### RocksDBPartitionTransaction.prepare + +```python +@_validate_transaction_status(PartitionTransactionStatus.STARTED) +def prepare(processed_offset: int) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L235) + +Produce changelog messages to the changelog topic for all changes accumulated + +in this transaction and prepare transaction to flush its state to the state +store. + +After successful `prepare()`, the transaction status is changed to PREPARED, +and it cannot receive updates anymore. + +If changelog is disabled for this application, no updates will be produced +to the changelog topic. 
+ +**Arguments**: + +- `processed_offset`: the offset of the latest processed message + + + +#### RocksDBPartitionTransaction.flush + +```python +@_validate_transaction_status(PartitionTransactionStatus.STARTED, + PartitionTransactionStatus.PREPARED) +def flush(processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L259) + +Flush the recent updates to the database. + +It writes the WriteBatch to RocksDB and marks itself as finished. + +If writing fails, the transaction is marked as failed and +cannot be used anymore. + +>***NOTE:*** If no keys have been modified during the transaction + (i.e. no "set" or "delete" have been called at least once), it will + not flush ANY data to the database including the offset to optimize + I/O. + +**Arguments**: + +- `processed_offset`: offset of the last processed message, optional. +- `changelog_offset`: offset of the last produced changelog message, +optional. + #### RocksDBPartitionTransaction.completed @@ -4572,7 +4672,7 @@ It first looks up the key in the update cache. def completed() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L275) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L294) Check if the transaction is completed. @@ -4585,6 +4685,26 @@ The completed transaction should not be re-used. `True` if transaction is completed, `False` otherwise. + + +#### RocksDBPartitionTransaction.prepared + +```python +@property +def prepared() -> bool +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L308) + +Check if the transaction is in PREPARED status. + +Prepared transaction successfully flushed its changelog and cannot receive +updates anymore, but its state is not yet flushed to the disk + +**Returns**: + +`True` if transaction is prepared, `False` otherwise. + #### RocksDBPartitionTransaction.failed @@ -4594,7 +4714,7 @@ The completed transaction should not be re-used. def failed() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L289) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L320) Check if the transaction has failed. @@ -4605,414 +4725,482 @@ and `True` if transaction is failed, `False` otherwise. - + -#### RocksDBPartitionTransaction.maybe\_flush +#### RocksDBPartitionTransaction.changelog\_topic\_partition ```python -@_validate_transaction_state -def maybe_flush(offset: Optional[int] = None) +@property +def changelog_topic_partition() -> Optional[Tuple[str, int]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/transaction.py#L318) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L332) -Flush the recent updates to the database and empty the update cache. +Return the changelog topic-partition for the StorePartition of this transaction. 
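
Putting the prefix-based transaction API above together — a minimal sketch, not part of this patch, assuming `tx` is a started `RocksDBPartitionTransaction` obtained from a store partition; the offsets are illustrative values:

```python
# Sketch only: `tx` is assumed to be a started RocksDBPartitionTransaction;
# the offset values are illustrative.
prefix = b"message-key"

tx.set(key="count", value=42, prefix=prefix)
assert tx.get(key="count", prefix=prefix) == 42          # read-your-own-writes

tx.prepare(processed_offset=100)                         # produce changelog messages
tx.flush(processed_offset=100, changelog_offset=5)       # write the batch to RocksDB
```
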
-It writes the WriteBatch to RocksDB and marks itself as finished. +Returns `None` if changelog_producer is not provided. -If writing fails, the transaction will be also marked as "failed" and -cannot be used anymore. +**Returns**: ->***NOTE:*** If no keys have been modified during the transaction - (i.e. no "set" or "delete" have been called at least once), it will - not flush ANY data to the database including the offset in order to optimize - I/O. +(topic, partition) or None + + + +#### RocksDBPartitionTransaction.as\_state + +```python +def as_state(prefix: Any = DEFAULT_PREFIX) -> TransactionState +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/transaction.py#L346) + +Create a one-time use `TransactionState` object with a limited CRUD interface + +to be provided to `StreamingDataFrame` operations. + +The `TransactionState` will prefix all the keys with the supplied `prefix` +for all underlying operations. **Arguments**: -- `offset`: offset of the last processed message, optional. +- `prefix`: a prefix to be used for all keys - +**Returns**: -## quixstreams.state.rocksdb.partition +an instance of `TransactionState` - + -### RocksDBStorePartition +## quixstreams.state.rocksdb + + + +## quixstreams.state.rocksdb.types + + + +## quixstreams.state.rocksdb.exceptions + + + +## quixstreams.state.rocksdb.serialization + + + +## quixstreams.state.recovery + + + +### RecoveryPartition ```python -class RocksDBStorePartition(StorePartition) +class RecoveryPartition() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L40) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L24) -A base class to access state in RocksDB. +A changelog topic partition mapped to a respective `StorePartition` with helper +methods to determine its current recovery status. -It represents a single RocksDB database. +Since `StorePartition`s do recovery directly, it also handles recovery transactions. -Responsibilities: - 1. Managing access to the RocksDB instance - 2. Creating transactions to interact with data - 3. Flushing WriteBatches to the RocksDB - 4. Producing state-related changelog messages + -It opens the RocksDB on `__init__`. If the db is locked by another process, -it will retry according to `open_max_retries` and `open_retry_backoff` options. +#### RecoveryPartition.offset + +```python +@property +def offset() -> int +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L63) + +Get the changelog offset from the underlying `StorePartition`. + +**Returns**: + +changelog offset (int) + + + +#### RecoveryPartition.needs\_recovery + +```python +@property +def needs_recovery() +``` -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L72) -- `path`: an absolute path to the RocksDB folder -- `options`: RocksDB options. If `None`, the default options will be used. +Determine whether recovery is necessary for underlying `StorePartition`. 
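
A minimal sketch (not part of this patch) of `as_state()` documented above; the `get`/`set` calls on the returned object assume the standard `State` interface, which is not part of this diff:

```python
# Sketch only: `tx` is assumed to be a started RocksDBPartitionTransaction.
state = tx.as_state(prefix=b"message-key")   # prefix-bound, one-time-use view
state.set("count", 1)                        # stored under the supplied prefix
count = state.get("count", default=0)
```
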
- + -#### RocksDBStorePartition.begin +#### RecoveryPartition.needs\_offset\_update ```python -def begin() -> RocksDBPartitionTransaction +@property +def needs_offset_update() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L80) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L81) -Create a new `RocksDBTransaction` object. +Determine if an offset update is required. -Using `RocksDBTransaction` is a recommended way for accessing the data. +Usually checked during assign if recovery was not required. -**Returns**: + -an instance of `RocksDBTransaction` +#### RecoveryPartition.update\_offset - +```python +def update_offset() +``` -#### RocksDBStorePartition.recover\_from\_changelog\_message +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L89) + +Update only the changelog offset of a StorePartition. + + + +#### RecoveryPartition.recover\_from\_changelog\_message ```python def recover_from_changelog_message( changelog_message: ConfluentKafkaMessageProto) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L106) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L109) -Updates state from a given changelog message. +Recover the StorePartition using a message read from its respective changelog. **Arguments**: -- `changelog_message`: A raw Confluent message read from a changelog topic. +- `changelog_message`: A confluent kafka message (everything as bytes) - + -#### RocksDBStorePartition.set\_changelog\_offset +#### RecoveryPartition.set\_watermarks ```python -def set_changelog_offset(changelog_offset: int) +def set_watermarks(lowwater: int, highwater: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L130) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L121) -Set the changelog offset based on a message (usually an "offset-only" message). - -Used during recovery. +Set the changelog watermarks as gathered from Consumer.get_watermark_offsets() **Arguments**: -- `changelog_offset`: A changelog offset +- `lowwater`: topic partition lowwater +- `highwater`: topic partition highwater - + -#### RocksDBStorePartition.produce\_to\_changelog +### ChangelogProducerFactory ```python -def produce_to_changelog(key: bytes, - value: Optional[bytes] = None, - headers: Optional[MessageHeadersMapping] = None) +class ChangelogProducerFactory() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L140) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L132) -Produce a message to the StorePartitions respective changelog. +Generates ChangelogProducers, which produce changelog messages to a StorePartition. 
- + -#### RocksDBStorePartition.write +#### ChangelogProducerFactory.\_\_init\_\_ ```python -def write(batch: WriteBatch) +def __init__(changelog_name: str, producer: RowProducer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L151) - -Write `WriteBatch` to RocksDB +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L137) **Arguments**: -- `batch`: an instance of `rocksdict.WriteBatch` +- `changelog_name`: changelog topic name +- `producer`: a RowProducer (not shared with `Application` instance) - +**Returns**: -#### RocksDBStorePartition.get +a ChangelogWriter instance + + + +#### ChangelogProducerFactory.get\_partition\_producer ```python -def get(key: bytes, - default: Any = None, - cf_name: str = "default") -> Union[None, bytes, Any] +def get_partition_producer(partition_num) -> "ChangelogProducer" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L158) - -Get a key from RocksDB. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L147) -**Arguments**: +Generate a ChangelogProducer for producing to a specific partition number -- `key`: a key encoded to `bytes` -- `default`: a default value to return if the key is not found. -- `cf_name`: rocksdb column family name. Default - "default" +(and thus StorePartition). -**Returns**: +**Arguments**: -a value if the key is present in the DB. Otherwise, `default` +- `partition_num`: source topic partition number - + -#### RocksDBStorePartition.exists +### ChangelogProducer ```python -def exists(key: bytes, cf_name: str = "default") -> bool +class ChangelogProducer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L172) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L161) -Check if a key is present in the DB. +Generated for a `StorePartition` to produce state changes to its respective +kafka changelog partition. -**Arguments**: + -- `key`: a key encoded to `bytes`. -- `cf_name`: rocksdb column family name. Default - "default" +#### ChangelogProducer.\_\_init\_\_ -**Returns**: +```python +def __init__(changelog_name: str, partition: int, producer: RowProducer) +``` -`True` if the key is present, `False` otherwise. 
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L167) - +**Arguments**: -#### RocksDBStorePartition.get\_processed\_offset +- `changelog_name`: A changelog topic name +- `partition`: source topic partition number +- `producer`: a RowProducer (not shared with `Application` instance) + + + +#### ChangelogProducer.produce ```python -def get_processed_offset() -> Optional[int] +def produce(key: bytes, + value: Optional[bytes] = None, + headers: Optional[MessageHeadersMapping] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L183) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L190) -Get last processed offset for the given partition +Produce a message to a changelog topic partition. -**Returns**: +**Arguments**: -offset or `None` if there's no processed offset yet +- `key`: message key (same as state key, including prefixes) +- `value`: message value (same as state value) +- `headers`: message headers (includes column family info) - + -#### RocksDBStorePartition.get\_changelog\_offset +### RecoveryManager ```python -def get_changelog_offset() -> Optional[int] +class RecoveryManager() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L195) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L215) -Get offset that the changelog is up-to-date with. +Manages all consumer-related aspects of recovery, including: + - assigning/revoking, pausing/resuming topic partitions (especially changelogs) + - consuming changelog messages until state is updated fully. -**Returns**: +Also tracks/manages `RecoveryPartitions`, which are assigned/tracked only if +recovery for that changelog partition is required. -offset or `None` if there's no processed offset yet +Recovery is attempted from the `Application` after any new partition assignment. 
- + -#### RocksDBStorePartition.close +#### RecoveryManager.partitions ```python -def close() +@property +def partitions() -> Dict[int, Dict[str, RecoveryPartition]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L205) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L234) -Close the underlying RocksDB +Returns a mapping of assigned RecoveryPartitions in the following format: +{: {: }} - + -#### RocksDBStorePartition.path +#### RecoveryManager.has\_assignments ```python @property -def path() -> str +def has_assignments() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L220) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L242) -Absolute path to RocksDB database folder +Whether the Application has assigned RecoveryPartitions **Returns**: -file path +has assignments, as bool - + -#### RocksDBStorePartition.destroy +#### RecoveryManager.recovering ```python -@classmethod -def destroy(cls, path: str) +@property +def recovering() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L228) - -Delete underlying RocksDB database +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L251) -The database must be closed first. +Whether the Application is currently recovering -**Arguments**: +**Returns**: -- `path`: an absolute path to the RocksDB folder +is recovering, as bool - + -#### RocksDBStorePartition.get\_column\_family\_handle +#### RecoveryManager.register\_changelog ```python -def get_column_family_handle(cf_name: str) -> ColumnFamily +def register_changelog(topic_name: str, store_name: str, + consumer_group: str) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L238) - -Get a column family handle to pass to it WriteBatch. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L259) -This method will cache the CF handle instance to avoid creating them -repeatedly. +Register a changelog Topic with the TopicManager. **Arguments**: -- `cf_name`: column family name - -**Returns**: - -instance of `rocksdict.ColumnFamily` +- `topic_name`: source topic name +- `store_name`: name of the store +- `consumer_group`: name of the consumer group - + -#### RocksDBStorePartition.get\_column\_family +#### RecoveryManager.do\_recovery ```python -def get_column_family(cf_name: str) -> Rdict +def do_recovery() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/partition.py#L259) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L275) -Get a column family instance. +If there are any active RecoveryPartitions, do a recovery procedure. -This method will cache the CF instance to avoid creating them repeatedly. +After, will resume normal `Application` processing. 
-**Arguments**: + -- `cf_name`: column family name +#### RecoveryManager.assign\_partition -**Returns**: +```python +def assign_partition(topic: str, partition: int, committed_offset: int, + store_partitions: Dict[str, StorePartition]) +``` -instance of `rocksdict.Rdict` for the given column family +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L328) - +Assigns `StorePartition`s (as `RecoveryPartition`s) ONLY IF recovery required. -## quixstreams.state.rocksdb.store +Pauses active consumer partitions as needed. - + -### RocksDBStore +#### RecoveryManager.revoke\_partition ```python -class RocksDBStore(Store) +def revoke_partition(partition_num: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/recovery.py#L395) -RocksDB-based state store. +revoke ALL StorePartitions (across all Stores) for a given partition number -It keeps track of individual store partitions and provides access to the -partitions' transactions. +**Arguments**: - +- `partition_num`: partition number of source topic -#### RocksDBStore.\_\_init\_\_ + + +## quixstreams.state + + + +## quixstreams.state.types + + + +### Store ```python -def __init__( - name: str, - topic: str, - base_dir: str, - changelog_producer_factory: Optional[ChangelogProducerFactory] = None, - options: Optional[options_type] = None) +class Store(Protocol) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L29) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L11) -**Arguments**: +Abstract state store. -- `name`: a unique store name -- `topic`: a topic name for this store -- `base_dir`: path to a directory with the state -- `changelog_producer_factory`: a ChangelogProducerFactory instance -if using changelogs -- `options`: RocksDB options. If `None`, the default options will be used. +It keeps track of individual store partitions and provides access to the +partitions' transactions. 
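The protocol methods documented below combine roughly as in this sketch; `store` is assumed to be any concrete `Store` implementation, and the partition number, prefix, keys, and offsets are placeholders:

```python
# Assign a partition, run one transaction against it, then clean up.
store.assign_partition(0)

tx = store.start_partition_transaction(partition=0)
state = tx.as_state(prefix=b"message-key")  # State view scoped to one key prefix
state.set("count", 1)
assert state.get("count") == 1

tx.prepare(processed_offset=10)  # produce changelog messages; no further updates allowed
tx.flush(processed_offset=10)    # persist the accumulated updates to the store

store.revoke_partition(0)
store.close()
```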
- + -#### RocksDBStore.topic +#### Store.topic ```python @property def topic() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L53) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L22) -Store topic name +Topic name - + -#### RocksDBStore.name +#### Store.name ```python @property def name() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L60) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L29) Store name - + -#### RocksDBStore.partitions +#### Store.partitions ```python @property -def partitions() -> Dict[int, RocksDBStorePartition] +def partitions() -> Dict[int, "StorePartition"] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L67) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L36) Mapping of assigned store partitions - +**Returns**: -#### RocksDBStore.assign\_partition +dict of "{partition: }" + + + +#### Store.assign\_partition ```python -def assign_partition(partition: int) -> RocksDBStorePartition +def assign_partition(partition: int) -> "StorePartition" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L80) - -Open and assign store partition. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L43) -If the partition is already assigned, it will not re-open it and return -the existing partition instead. +Assign new store partition **Arguments**: @@ -5020,40 +5208,37 @@ the existing partition instead. **Returns**: -instance of`RocksDBStorePartition` +instance of `StorePartition` - + -#### RocksDBStore.revoke\_partition +#### Store.revoke\_partition ```python def revoke_partition(partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L117) - -Revoke and close the assigned store partition. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L52) -If the partition is not assigned, it will log the message and return. +Revoke assigned store partition **Arguments**: - `partition`: partition number - + -#### RocksDBStore.start\_partition\_transaction +#### Store.start\_partition\_transaction ```python -def start_partition_transaction(partition: int) -> RocksDBPartitionTransaction +def start_partition_transaction(partition: int) -> "PartitionTransaction" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L138) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L60) Start a new partition transaction. -`RocksDBPartitionTransaction` is the primary interface for working with data in -the underlying RocksDB. +`PartitionTransaction` is the primary interface for working with data in Stores. **Arguments**: @@ -5061,109 +5246,148 @@ the underlying RocksDB. 
**Returns**: -instance of `RocksDBPartitionTransaction` +instance of `PartitionTransaction` - + -#### RocksDBStore.close +#### Store.close ```python def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/store.py#L160) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L69) -Close the store and revoke all assigned partitions +Close store and revoke all store partitions - + -## quixstreams.state.rocksdb.exceptions +### StorePartition - +```python +class StorePartition(Protocol) +``` -## quixstreams.state.rocksdb.types +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L80) - +A base class to access state in the underlying storage. +It represents a single instance of some storage (e.g. a single database for +the persistent storage). -## quixstreams.state.rocksdb.options + - +#### StorePartition.path -### RocksDBOptions +```python +@property +def path() -> str +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L89) + +Absolute path to RocksDB database folder + + + +#### StorePartition.begin ```python -@dataclasses.dataclass(frozen=True) -class RocksDBOptions(RocksDBOptionsType) +def begin() -> "PartitionTransaction" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/options.py#L25) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L95) -RocksDB database options. +State new `PartitionTransaction` + + + +#### StorePartition.recover\_from\_changelog\_message + +```python +def recover_from_changelog_message( + changelog_message: ConfluentKafkaMessageProto, committed_offset: int) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L100) + +Updates state from a given changelog message. **Arguments**: -- `dumps`: function to dump data to JSON -- `loads`: function to load data from JSON -- `open_max_retries`: number of times to retry opening the database -if it's locked by another process. To disable retrying, pass 0 -- `open_retry_backoff`: number of seconds to wait between each retry. -Please see `rocksdict.Options` for a complete description of other options. +- `changelog_message`: A raw Confluent message read from a changelog topic. 
+- `committed_offset`: latest committed offset for the partition - + -#### RocksDBOptions.to\_options +#### StorePartition.get\_processed\_offset ```python -def to_options() -> rocksdict.Options +def get_processed_offset() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/options.py#L53) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L111) -Convert parameters to `rocksdict.Options` +Get last processed offset for the given partition **Returns**: -instance of `rocksdict.Options` - - - -## quixstreams.state.state +offset or `None` if there's no processed offset yet - + -### TransactionState +#### StorePartition.get\_changelog\_offset ```python -class TransactionState(State) +def get_changelog_offset() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L6) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L118) - +Get offset that the changelog is up-to-date with. -#### TransactionState.\_\_init\_\_ +**Returns**: + +offset or `None` if there's no processed offset yet + + + +#### StorePartition.set\_changelog\_offset ```python -def __init__(transaction: PartitionTransaction) +def set_changelog_offset(changelog_offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L9) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L125) -Simple key-value state to be provided into `StreamingDataFrame` functions +Set the changelog offset based on a message (usually an "offset-only" message). + +Used during recovery. **Arguments**: -- `transaction`: instance of `PartitionTransaction` +- `changelog_offset`: A changelog offset - + -#### TransactionState.get +### State + +```python +class State(Protocol) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L136) + +Primary interface for working with key-value state data from `StreamingDataFrame` + + + +#### State.get ```python def get(key: Any, default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L17) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L141) Get the value for key if key is present in the state, else default @@ -5176,15 +5400,15 @@ Get the value for key if key is present in the state, else default value or None if the key is not found and `default` is not provided - + -#### TransactionState.set +#### State.set ```python def set(key: Any, value: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L27) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L151) Set value for the key. @@ -5193,15 +5417,15 @@ Set value for the key. 
- `key`: key - `value`: value - + -#### TransactionState.delete +#### State.delete ```python def delete(key: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L35) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L159) Delete value for the key. @@ -5211,15 +5435,15 @@ This function always returns `None`, even if value is not found. - `key`: key - + -#### TransactionState.exists +#### State.exists ```python def exists(key: Any) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/state.py#L44) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L168) Check if the key exists in state. @@ -5231,908 +5455,904 @@ Check if the key exists in state. True if key exists, False otherwise - - -## quixstreams.state - - - -## quixstreams.state.manager - - - -### StateStoreManager - -```python -class StateStoreManager() -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L31) - -Class for managing state stores and partitions. - -StateStoreManager is responsible for: - - reacting to rebalance callbacks - - managing the individual state stores - - providing access to store transactions - - + -#### StateStoreManager.stores +### PartitionTransaction ```python -@property -def stores() -> Dict[str, Dict[str, Store]] +class PartitionTransaction(Protocol) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L71) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L177) -Map of registered state stores - -**Returns**: - -dict in format {topic: {store_name: store}} +A transaction class to perform simple key-value operations like +"get", "set", "delete" and "exists" on a single storage partition. - + -#### StateStoreManager.recovery\_required +#### PartitionTransaction.as\_state ```python -@property -def recovery_required() -> bool +def as_state(prefix: Any) -> State ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L79) - -Whether recovery needs to be done. - - - -#### StateStoreManager.using\_changelogs - -```python -@property -def using_changelogs() -> bool -``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L183) -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L88) +Create an instance implementing the `State` protocol to be provided -Whether the StateStoreManager is using changelog topics +to `StreamingDataFrame` functions. +All operations called on this State object will be prefixed with +the supplied `prefix`. 
**Returns**: -using changelogs, as bool +an instance implementing the `State` protocol - + -#### StateStoreManager.do\_recovery +#### PartitionTransaction.get ```python -def do_recovery() +def get(key: Any, prefix: bytes, default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L96) - -Perform a state recovery, if necessary. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L194) - +Get the value for key if key is present in the state, else default -#### StateStoreManager.stop\_recovery +**Arguments**: -```python -def stop_recovery() -``` +- `key`: key +- `prefix`: a key prefix +- `default`: default value to return if the key is not found -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L102) +**Returns**: -Stop recovery (called during app shutdown). +value or None if the key is not found and `default` is not provided - + -#### StateStoreManager.get\_store +#### PartitionTransaction.set ```python -def get_store(topic: str, - store_name: str = _DEFAULT_STATE_STORE_NAME) -> Store +def set(key: Any, prefix: bytes, value: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L108) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L205) -Get a store for given name and topic +Set value for the key. **Arguments**: -- `topic`: topic name -- `store_name`: store name - -**Returns**: - -instance of `Store` +- `key`: key +- `prefix`: a key prefix +- `value`: value - + -#### StateStoreManager.register\_store +#### PartitionTransaction.delete ```python -def register_store(topic_name: str, - store_name: str = _DEFAULT_STATE_STORE_NAME) +def delete(key: Any, prefix: bytes) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L141) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L214) -Register a state store to be managed by StateStoreManager. - -During processing, the StateStoreManager will react to rebalancing callbacks -and assign/revoke the partitions for registered stores. +Delete value for the key. -Each store can be registered only once for each topic. +This function always returns `None`, even if value is not found. **Arguments**: -- `topic_name`: topic name -- `store_name`: store name +- `key`: key +- `prefix`: a key prefix - + -#### StateStoreManager.register\_windowed\_store +#### PartitionTransaction.exists ```python -def register_windowed_store(topic_name: str, store_name: str) +def exists(key: Any, prefix: bytes) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L166) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L224) -Register a windowed state store to be managed by StateStoreManager. +Check if the key exists in state. -During processing, the StateStoreManager will react to rebalancing callbacks -and assign/revoke the partitions for registered stores. 
+**Arguments**: -Each window store can be registered only once for each topic. +- `key`: key +- `prefix`: a key prefix -**Arguments**: +**Returns**: -- `topic_name`: topic name -- `store_name`: store name +True if key exists, False otherwise - + -#### StateStoreManager.clear\_stores +#### PartitionTransaction.failed ```python -def clear_stores() +@property +def failed() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L189) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L234) -Delete all state stores managed by StateStoreManager. +Return `True` if transaction failed to update data at some point. - +Failed transactions cannot be re-used. -#### StateStoreManager.on\_partition\_assign +**Returns**: -```python -def on_partition_assign(tp: TopicPartition) -> List[StorePartition] -``` +bool -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L204) + -Assign store partitions for each registered store for the given `TopicPartition` +#### PartitionTransaction.completed -and return a list of assigned `StorePartition` objects. +```python +@property +def completed() -> bool +``` -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L244) -- `tp`: `TopicPartition` from Kafka consumer +Return `True` if transaction is successfully completed. + +Completed transactions cannot be re-used. **Returns**: -list of assigned `StorePartition` +bool - + -#### StateStoreManager.on\_partition\_revoke +#### PartitionTransaction.prepared ```python -def on_partition_revoke(tp: TopicPartition) +@property +def prepared() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L223) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L254) -Revoke store partitions for each registered store for the given `TopicPartition` +Return `True` if transaction is prepared completed. -**Arguments**: +Prepared transactions cannot receive new updates, but can be flushed. + +**Returns**: -- `tp`: `TopicPartition` from Kafka consumer +bool - + -#### StateStoreManager.on\_partition\_lost +#### PartitionTransaction.prepare ```python -def on_partition_lost(tp: TopicPartition) +def prepare(processed_offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L235) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L263) + +Produce changelog messages to the changelog topic for all changes accumulated + +in this transaction and prepare transcation to flush its state to the state +store. -Revoke and close store partitions for each registered store for the given +After successful `prepare()`, the transaction status is changed to PREPARED, +and it cannot receive updates anymore. -`TopicPartition` +If changelog is disabled for this application, no updates will be produced +to the changelog topic. 
**Arguments**: -- `tp`: `TopicPartition` from Kafka consumer +- `processed_offset`: the offset of the latest processed message - + -#### StateStoreManager.init +#### PartitionTransaction.changelog\_topic\_partition ```python -def init() +@property +def changelog_topic_partition() -> Optional[Tuple[str, int]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L244) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L279) -Initialize `StateStoreManager` and create a store directory +Return the changelog topic-partition for the StorePartition of this transaction. +Returns `None` if changelog_producer is not provided. - +**Returns**: -#### StateStoreManager.close +(topic, partition) or None + + + +#### PartitionTransaction.flush ```python -def close() +def flush(processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L251) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L288) -Close all registered stores +Flush the recent updates to the storage. - +**Arguments**: -#### StateStoreManager.get\_store\_transaction +- `processed_offset`: offset of the last processed message, optional. +- `changelog_offset`: offset of the last produced changelog message, +optional. -```python -def get_store_transaction( - store_name: str = _DEFAULT_STATE_STORE_NAME) -> PartitionTransaction -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L259) +### WindowedState -Get active `PartitionTransaction` for the store +```python +class WindowedState(Protocol) +``` -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L306) -- `store_name`: +A windowed state to be provided into `StreamingDataFrame` window functions. - + -#### StateStoreManager.start\_store\_transaction +#### WindowedState.get\_window ```python -@contextlib.contextmanager -def start_store_transaction(topic: str, partition: int, - offset: int) -> Iterator["_MultiStoreTransaction"] +def get_window(start_ms: int, + end_ms: int, + default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/manager.py#L274) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L311) -Starting the multi-store transaction for the Kafka message. - -This transaction will keep track of all used stores and flush them in the end. -If any exception is caught during this transaction, none of them -will be flushed as a best effort to keep stores consistent in "at-least-once" setting. +Get the value of the window defined by `start` and `end` timestamps -There can be only one active transaction at a time. Starting a new transaction -before the end of the current one will fail. 
+if the window is present in the state, else default **Arguments**: -- `topic`: message topic -- `partition`: message partition -- `offset`: message offset +- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `default`: default value to return if the key is not found - +**Returns**: -## quixstreams.state.recovery +value or None if the key is not found and `default` is not provided - + -### RecoveryPartition +#### WindowedState.update\_window ```python -class RecoveryPartition() +def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L20) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L325) -A changelog topic partition mapped to a respective `StorePartition` with helper -methods to determine its current recovery status. +Set a value for the window. -Since `StorePartition`s do recovery directly, it also handles recovery transactions. +This method will also update the latest observed timestamp in state partition +using the provided `timestamp`. - +**Arguments**: -#### RecoveryPartition.offset +- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `value`: value of the window +- `timestamp_ms`: current message timestamp in milliseconds + + + +#### WindowedState.get\_latest\_timestamp ```python -@property -def offset() -> int +def get_latest_timestamp() -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L41) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L339) -Get the changelog offset from the underlying `StorePartition`. +Get the latest observed timestamp for the current state partition. + +Use this timestamp to determine if the arriving event is late and should be +discarded from the processing. **Returns**: -changelog offset (int) +latest observed event timestamp in milliseconds - + -#### RecoveryPartition.needs\_recovery +#### WindowedState.expire\_windows ```python -@property -def needs_recovery() +def expire_windows(duration_ms: int, grace_ms: int = 0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L50) - -Determine whether recovery is necessary for underlying `StorePartition`. - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L350) -#### RecoveryPartition.needs\_offset\_update +Get a list of expired windows from RocksDB considering the current -```python -@property -def needs_offset_update() -``` +latest timestamp, window duration and grace period. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L59) +It also marks the latest found window as expired in the expiration index, so +calling this method multiple times will yield different results for the same +"latest timestamp". -Determine if an offset update is required. +**Arguments**: -Usually checked during assign if recovery was not required. +- `duration_ms`: duration of the windows in milliseconds +- `grace_ms`: grace period in milliseconds. 
Default - "0" - + -#### RecoveryPartition.update\_offset +### WindowedPartitionTransaction ```python -def update_offset() +class WindowedPartitionTransaction(Protocol) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L67) - -Update only the changelog offset of a StorePartition. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L365) - + -#### RecoveryPartition.recover\_from\_changelog\_message +#### WindowedPartitionTransaction.failed ```python -def recover_from_changelog_message( - changelog_message: ConfluentKafkaMessageProto) +@property +def failed() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L87) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L368) -Recover the StorePartition using a message read from its respective changelog. +Return `True` if transaction failed to update data at some point. -**Arguments**: +Failed transactions cannot be re-used. -- `changelog_message`: A confluent kafka message (everything as bytes) +**Returns**: - +bool -#### RecoveryPartition.set\_watermarks + + +#### WindowedPartitionTransaction.completed ```python -def set_watermarks(lowwater: int, highwater: int) +@property +def completed() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L99) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L378) -Set the changelog watermarks as gathered from Consumer.get_watermark_offsets() +Return `True` if transaction is successfully completed. -**Arguments**: +Completed transactions cannot be re-used. -- `lowwater`: topic partition lowwater -- `highwater`: topic partition highwater +**Returns**: - +bool -### ChangelogProducerFactory + + +#### WindowedPartitionTransaction.prepared ```python -class ChangelogProducerFactory() +@property +def prepared() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L110) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L388) -Generates ChangelogProducers, which produce changelog messages to a StorePartition. +Return `True` if transaction is prepared completed. - +Prepared transactions cannot receive new updates, but can be flushed. -#### ChangelogProducerFactory.\_\_init\_\_ +**Returns**: + +bool + + + +#### WindowedPartitionTransaction.prepare ```python -def __init__(changelog_name: str, producer: RowProducer) +def prepare(processed_offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L115) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L397) -**Arguments**: +Produce changelog messages to the changelog topic for all changes accumulated -- `changelog_name`: changelog topic name -- `producer`: a RowProducer (not shared with `Application` instance) +in this transaction and prepare transcation to flush its state to the state +store. 
-**Returns**: +After successful `prepare()`, the transaction status is changed to PREPARED, +and it cannot receive updates anymore. -a ChangelogWriter instance +If changelog is disabled for this application, no updates will be produced +to the changelog topic. - +**Arguments**: -#### ChangelogProducerFactory.get\_partition\_producer +- `processed_offset`: the offset of the latest processed message + + + +#### WindowedPartitionTransaction.get\_window ```python -def get_partition_producer(partition_num) +def get_window(start_ms: int, + end_ms: int, + prefix: bytes, + default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L125) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L414) -Generate a ChangelogProducer for producing to a specific partition number +Get the value of the window defined by `start` and `end` timestamps -(and thus StorePartition). +if the window is present in the state, else default **Arguments**: -- `partition_num`: source topic partition number +- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `prefix`: a key prefix +- `default`: default value to return if the key is not found - +**Returns**: -### ChangelogProducer +value or None if the key is not found and `default` is not provided + + + +#### WindowedPartitionTransaction.update\_window ```python -class ChangelogProducer() +def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int, + prefix: bytes) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L137) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L433) -Generated for a `StorePartition` to produce state changes to its respective -kafka changelog partition. +Set a value for the window. - +This method will also update the latest observed timestamp in state partition +using the provided `timestamp`. -#### ChangelogProducer.\_\_init\_\_ +**Arguments**: + +- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `value`: value of the window +- `timestamp_ms`: current message timestamp in milliseconds +- `prefix`: a key prefix + + + +#### WindowedPartitionTransaction.get\_latest\_timestamp ```python -def __init__(changelog_name: str, partition_num: int, producer: RowProducer) +def get_latest_timestamp() -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L143) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L450) -**Arguments**: +Get the latest observed timestamp for the current state partition. -- `changelog_name`: A changelog topic name -- `partition_num`: source topic partition number -- `producer`: a RowProducer (not shared with `Application` instance) +Use this timestamp to determine if the arriving event is late and should be +discarded from the processing. 
- +**Returns**: -#### ChangelogProducer.produce +latest observed event timestamp in milliseconds + + + +#### WindowedPartitionTransaction.expire\_windows ```python -def produce(key: bytes, - value: Optional[bytes] = None, - headers: Optional[MessageHeadersMapping] = None) +def expire_windows(duration_ms: int, prefix: bytes, grace_ms: int = 0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L153) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L461) -Produce a message to a changelog topic partition. +Get a list of expired windows from RocksDB considering the current + +latest timestamp, window duration and grace period. + +It also marks the latest found window as expired in the expiration index, so +calling this method multiple times will yield different results for the same +"latest timestamp". **Arguments**: -- `key`: message key (same as state key, including prefixes) -- `value`: message value (same as state value) -- `headers`: message headers (includes column family info) +- `duration_ms`: duration of the windows in milliseconds +- `prefix`: a key prefix +- `grace_ms`: grace period in milliseconds. Default - "0" - + -### RecoveryManager +#### WindowedPartitionTransaction.flush ```python -class RecoveryManager() +def flush(processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L178) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L476) -Manages all consumer-related aspects of recovery, including: - - assigning/revoking, pausing/resuming topic partitions (especially changelogs) - - consuming changelog messages until state is updated fully. +Flush the recent updates to the storage. -Also tracks/manages `RecoveryPartitions`, which are assigned/tracked only if -recovery for that changelog partition is required. +**Arguments**: -Recovery is attempted from the `Application` after any new partition assignment. +- `processed_offset`: offset of the last processed message, optional. +- `changelog_offset`: offset of the last produced changelog message, +optional. - + -#### RecoveryManager.has\_assignments +#### WindowedPartitionTransaction.changelog\_topic\_partition ```python @property -def has_assignments() -> bool +def changelog_topic_partition() -> Optional[Tuple[str, int]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L197) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L490) -Whether the Application has assigned RecoveryPartitions +Return the changelog topic-partition for the StorePartition of this transaction. + +Returns `None` if changelog_producer is not provided. 
**Returns**: -has assignments, as bool +(topic, partition) or None - + -#### RecoveryManager.recovering +### PartitionRecoveryTransaction ```python -@property -def recovering() -> bool +class PartitionRecoveryTransaction(Protocol) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L206) - -Whether the Application is currently recovering - -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L504) -is recovering, as bool +A class for managing recovery for a StorePartition from a changelog message - + -#### RecoveryManager.register\_changelog +#### PartitionRecoveryTransaction.flush ```python -def register_changelog(topic_name: str, store_name: str, consumer_group: str) +def flush() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L214) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L511) -Register a changelog Topic with the TopicManager. - -**Arguments**: - -- `topic_name`: source topic name -- `store_name`: name of the store -- `consumer_group`: name of the consumer group +Flush the recovery update to the storage. - + -#### RecoveryManager.do\_recovery +### PartitionTransactionStatus ```python -def do_recovery() +class PartitionTransactionStatus(enum.Enum) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L228) - -If there are any active RecoveryPartitions, do a recovery procedure. - -After, will resume normal `Application` processing. - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L518) -#### RecoveryManager.assign\_partition + -```python -def assign_partition(topic_name: str, partition_num: int, - store_partitions: Dict[str, StorePartition]) -``` +#### STARTED -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L274) +Transaction is started and accepts updates -Assigns `StorePartition`s (as `RecoveryPartition`s) ONLY IF recovery required. + -Pauses active consumer partitions as needed. +#### PREPARED - +Transaction is prepared, it can no longer receive updates -#### RecoveryManager.revoke\_partition + -```python -def revoke_partition(partition_num: int) -``` +#### COMPLETE -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/recovery.py#L336) +Transaction is fully completed, it cannot be used anymore -revoke ALL StorePartitions (across all Stores) for a given partition number + -**Arguments**: +#### FAILED -- `partition_num`: partition number of source topic +Transaction is failed, it cannot be used anymore ## quixstreams.state.exceptions - + -## quixstreams.state.types +## quixstreams.state.manager - + -### Store +### StateStoreManager ```python -class Store(Protocol) +class StateStoreManager() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L14) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L24) -Abstract state store. +Class for managing state stores and partitions. 
-It keeps track of individual store partitions and provides access to the -partitions' transactions. +StateStoreManager is responsible for: + - reacting to rebalance callbacks + - managing the individual state stores + - providing access to store transactions - + -#### Store.topic +#### StateStoreManager.stores ```python @property -def topic() -> str +def stores() -> Dict[str, Dict[str, Store]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L25) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L63) -Topic name +Map of registered state stores - +**Returns**: -#### Store.name +dict in format {topic: {store_name: store}} + + + +#### StateStoreManager.recovery\_required ```python @property -def name() -> str +def recovery_required() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L32) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L71) -Store name +Whether recovery needs to be done. - + -#### Store.partitions +#### StateStoreManager.using\_changelogs ```python @property -def partitions() -> Dict[int, "StorePartition"] +def using_changelogs() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L39) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L80) -Mapping of assigned store partitions +Whether the StateStoreManager is using changelog topics **Returns**: -dict of "{partition: }" +using changelogs, as bool - + -#### Store.assign\_partition +#### StateStoreManager.do\_recovery ```python -def assign_partition(partition: int) -> "StorePartition" +def do_recovery() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L46) - -Assign new store partition - -**Arguments**: - -- `partition`: partition number - -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L88) -instance of `StorePartition` +Perform a state recovery, if necessary. - + -#### Store.revoke\_partition +#### StateStoreManager.stop\_recovery ```python -def revoke_partition(partition: int) +def stop_recovery() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L55) - -Revoke assigned store partition - -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L94) -- `partition`: partition number +Stop recovery (called during app shutdown). - + -#### Store.start\_partition\_transaction +#### StateStoreManager.get\_store ```python -def start_partition_transaction( - partition: int) -> Optional["PartitionTransaction"] +def get_store(topic: str, store_name: str = DEFAULT_STATE_STORE_NAME) -> Store ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L63) - -Start a new partition transaction. 
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L100) -`PartitionTransaction` is the primary interface for working with data in Stores. +Get a store for given name and topic **Arguments**: -- `partition`: partition number +- `topic`: topic name +- `store_name`: store name **Returns**: -instance of `PartitionTransaction` +instance of `Store` - + -#### Store.close +#### StateStoreManager.register\_store ```python -def close() +def register_store(topic_name: str, + store_name: str = DEFAULT_STATE_STORE_NAME) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L74) - -Close store and revoke all store partitions +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L134) - +Register a state store to be managed by StateStoreManager. -### StorePartition +During processing, the StateStoreManager will react to rebalancing callbacks +and assign/revoke the partitions for registered stores. -```python -class StorePartition(Protocol) -``` +Each store can be registered only once for each topic. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L85) +**Arguments**: -A base class to access state in the underlying storage. -It represents a single instance of some storage (e.g. a single database for -the persistent storage). +- `topic_name`: topic name +- `store_name`: store name - + -#### StorePartition.path +#### StateStoreManager.register\_windowed\_store ```python -@property -def path() -> str +def register_windowed_store(topic_name: str, store_name: str) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L94) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L159) -Absolute path to RocksDB database folder +Register a windowed state store to be managed by StateStoreManager. - +During processing, the StateStoreManager will react to rebalancing callbacks +and assign/revoke the partitions for registered stores. -#### StorePartition.begin +Each window store can be registered only once for each topic. + +**Arguments**: + +- `topic_name`: topic name +- `store_name`: store name + + + +#### StateStoreManager.clear\_stores ```python -def begin() -> "PartitionTransaction" +def clear_stores() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L100) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L182) -State new `PartitionTransaction` +Delete all state stores managed by StateStoreManager. 
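A hedged sketch of the `StateStoreManager` calls documented above, assuming `state_manager` is an already-initialized instance; the topic and store names are placeholders:

```python
# Register a store for a topic, then fetch it back by name.
state_manager.register_store(topic_name="input-topic", store_name="default")
store = state_manager.get_store(topic="input-topic", store_name="default")

# Wipe all locally stored state; meant to be called while no partitions are assigned.
state_manager.clear_stores()
```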
- + -#### StorePartition.recover\_from\_changelog\_message +#### StateStoreManager.on\_partition\_assign ```python -def recover_from_changelog_message( - changelog_message: ConfluentKafkaMessageProto) +def on_partition_assign(topic: str, partition: int, + committed_offset: int) -> List[StorePartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L105) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L197) -Updates state from a given changelog message. +Assign store partitions for each registered store for the given `TopicPartition` + +and return a list of assigned `StorePartition` objects. **Arguments**: -- `changelog_message`: A raw Confluent message read from a changelog topic. +- `topic`: Kafka topic name +- `partition`: Kafka topic partition +- `committed_offset`: latest committed offset for the partition + +**Returns**: + +list of assigned `StorePartition` - + -#### StorePartition.produce\_to\_changelog +#### StateStoreManager.on\_partition\_revoke ```python -def produce_to_changelog(key: bytes, - value: Optional[bytes] = None, - headers: Optional[MessageHeadersMapping] = None) +def on_partition_revoke(topic: str, partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L115) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L223) -Produce a message to the StorePartitions respective changelog. +Revoke store partitions for each registered store for the given `TopicPartition` - +**Arguments**: + +- `topic`: Kafka topic name +- `partition`: Kafka topic partition + + -#### StorePartition.get\_processed\_offset +#### StateStoreManager.init ```python -def get_processed_offset() -> Optional[int] +def init() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L126) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L236) -Get last processed offset for the given partition - -**Returns**: +Initialize `StateStoreManager` and create a store directory -offset or `None` if there's no processed offset yet - + -#### StorePartition.get\_changelog\_offset +#### StateStoreManager.close ```python -def get_changelog_offset() -> Optional[int] +def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L133) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/manager.py#L243) -Get offset that the changelog is up-to-date with. +Close all registered stores -**Returns**: + -offset or `None` if there's no processed offset yet +## quixstreams.state.state - + -#### StorePartition.set\_changelog\_offset +### TransactionState ```python -def set_changelog_offset(changelog_offset: int) +class TransactionState(State) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L140) - -Set the changelog offset based on a message (usually an "offset-only" message). - -Used during recovery. 
- -**Arguments**: - -- `changelog_offset`: A changelog offset +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L6) - + -### State +#### TransactionState.\_\_init\_\_ ```python -class State(Protocol) +def __init__(prefix: bytes, transaction: PartitionTransaction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L151) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L12) -Primary interface for working with key-value state data from `StreamingDataFrame` +Simple key-value state to be provided into `StreamingDataFrame` functions - +**Arguments**: -#### State.get +- `transaction`: instance of `PartitionTransaction` + + + +#### TransactionState.get ```python def get(key: Any, default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L156) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L21) Get the value for key if key is present in the state, else default @@ -6145,15 +6365,15 @@ Get the value for key if key is present in the state, else default value or None if the key is not found and `default` is not provided - + -#### State.set +#### TransactionState.set ```python def set(key: Any, value: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L166) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L31) Set value for the key. @@ -6162,15 +6382,15 @@ Set value for the key. - `key`: key - `value`: value - + -#### State.delete +#### TransactionState.delete ```python def delete(key: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L174) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L39) Delete value for the key. @@ -6180,15 +6400,15 @@ This function always returns `None`, even if value is not found. - `key`: key - + -#### State.exists +#### TransactionState.exists ```python def exists(key: Any) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L183) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/state.py#L48) Check if the key exists in state. @@ -6200,743 +6420,813 @@ Check if the key exists in state. True if key exists, False otherwise - + -### PartitionTransaction +## quixstreams.exceptions + + + +## quixstreams.exceptions.assignment + + + +### PartitionAssignmentError ```python -class PartitionTransaction(State) +class PartitionAssignmentError(QuixException) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L192) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/exceptions/assignment.py#L6) -A transaction class to perform simple key-value operations like -"get", "set", "delete" and "exists" on a single storage partition. 
+Error happened during partition rebalancing. +Raised from `on_assign`, `on_revoke` and `on_lost` callbacks + + + +## quixstreams.exceptions.base + + + +## quixstreams.context - + -#### PartitionTransaction.state +#### set\_message\_context ```python -@property -def state() -> State +def set_message_context(context: Optional[MessageContext]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L199) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L21) -An instance of State to be provided to `StreamingDataFrame` functions +Set a MessageContext for the current message in the given `contextvars.Context` +>***NOTE:*** This is for advanced usage only. If you need to change the message key, +`StreamingDataFrame.to_topic()` has an argument for it. - -#### PartitionTransaction.failed +Example Snippet: ```python -@property -def failed() -> bool -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L207) +from quixstreams import Application, set_message_context, message_context -Return `True` if transaction failed to update data at some point. +# Changes the current sdf value based on what the message partition is. +def alter_context(value): + context = message_context() + if value > 1: + context.headers = context.headers + (b"cool_new_header", value.encode()) + set_message_context(context) -Failed transactions cannot be re-used. +app = Application() +sdf = app.dataframe() +sdf = sdf.update(lambda value: alter_context(value)) +``` -**Returns**: +**Arguments**: -bool +- `context`: instance of `MessageContext` - + -#### PartitionTransaction.completed +#### message\_context ```python -@property -def completed() -> bool +def message_context() -> MessageContext ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L217) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L52) -Return `True` if transaction is completed. +Get a MessageContext for the current message, which houses most of the message -Completed transactions cannot be re-used. +metadata, like: + - key + - timestamp + - partition + - offset -**Returns**: -bool +Example Snippet: - +```python +from quixstreams import Application, message_context -#### PartitionTransaction.with\_prefix +# Changes the current sdf value based on what the message partition is. -```python -@contextlib.contextmanager -def with_prefix(prefix: Any = b"") -> Iterator[Self] +app = Application() +sdf = app.dataframe() +sdf = sdf.apply(lambda value: 1 if message_context().partition == 2 else 0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L227) - -A context manager set the prefix for all keys in the scope. +**Returns**: -Normally, it's called by `StreamingDataFrame` internals to ensure that every -message key is stored separately. +instance of `MessageContext` -**Arguments**: + -- `prefix`: key prefix +#### message\_key -**Returns**: +```python +def message_key() -> Any +``` -context manager +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/context.py#L83) - +Get the current message's key. 
-#### PartitionTransaction.maybe\_flush +Example Snippet: ```python -def maybe_flush(offset: Optional[int] = None) +from quixstreams import Application, message_key + +# Changes the current sdf value based on what the message key is. + +app = Application() +sdf = app.dataframe() +sdf = sdf.apply(lambda value: 1 if message_key() == b'1' else 0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L238) +**Returns**: -Flush the recent updates and last processed offset to the storage. +a deserialized message key -**Arguments**: + -- `offset`: offset of the last processed message, optional. +## quixstreams.kafka - + -### WindowedState +## quixstreams.kafka.producer + + + +### Producer ```python -class WindowedState(Protocol) +class Producer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L249) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L37) -A windowed state to be provided into `StreamingDataFrame` window functions. - - + -#### WindowedState.get\_window +#### Producer.\_\_init\_\_ ```python -def get_window(start_ms: int, - end_ms: int, - default: Any = None) -> Optional[Any] +def __init__(broker_address: str, + partitioner: Partitioner = "murmur2", + extra_config: Optional[dict] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L254) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L38) -Get the value of the window defined by `start` and `end` timestamps +A wrapper around `confluent_kafka.Producer`. -if the window is present in the state, else default +It initializes `confluent_kafka.Producer` on demand +avoiding network calls during `__init__`, provides typing info for methods +and some reasonable defaults. **Arguments**: -- `start_ms`: start of the window in milliseconds -- `end_ms`: end of the window in milliseconds -- `default`: default value to return if the key is not found - -**Returns**: - -value or None if the key is not found and `default` is not provided +- `broker_address`: Kafka broker host and port in format `:`. +Passed as `bootstrap.servers` to `confluent_kafka.Producer`. +- `partitioner`: A function to be used to determine the outgoing message +partition. +Available values: "random", "consistent_random", "murmur2", "murmur2_random", +"fnv1a", "fnv1a_random" +Default - "murmur2". +- `extra_config`: A dictionary with additional options that +will be passed to `confluent_kafka.Producer` as is. +Note: values passed as arguments override values in `extra_config`. 
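For reference, a minimal usage sketch of the `Producer` wrapper described above; the broker address, topic name, and payload are placeholders.

```python
from quixstreams.kafka import Producer

producer = Producer(broker_address="localhost:9092")

# produce() polls for delivery callbacks internally and retries on BufferError
producer.produce(
    topic="output-topic",
    key=b"sensor-1",
    value=b'{"temperature": 21.5}',
    headers={"source": b"docs-example"},
)

# Block until the queued messages are delivered; returns how many are left.
remaining = producer.flush(timeout=10.0)
print(f"Messages left to flush: {remaining}")
```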
- + -#### WindowedState.update\_window +#### Producer.produce ```python -def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int) +def produce(topic: str, + value: Optional[Union[str, bytes]] = None, + key: Optional[Union[str, bytes]] = None, + headers: Optional[Headers] = None, + partition: Optional[int] = None, + timestamp: Optional[int] = None, + poll_timeout: float = 5.0, + buffer_error_max_tries: int = 3, + on_delivery: Optional[DeliveryCallback] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L268) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L74) -Set a value for the window. +Produce a message to a topic. -This method will also update the latest observed timestamp in state partition -using the provided `timestamp`. +It also polls Kafka for callbacks before producing to minimize +the probability of `BufferError`. +If `BufferError` still happens, the method will poll Kafka with timeout +to free up the buffer and try again. **Arguments**: -- `start_ms`: start of the window in milliseconds -- `end_ms`: end of the window in milliseconds -- `value`: value of the window -- `timestamp_ms`: current message timestamp in milliseconds +- `topic`: topic name +- `value`: message value +- `key`: message key +- `headers`: message headers +- `partition`: topic partition +- `timestamp`: message timestamp +- `poll_timeout`: timeout for `poll()` call in case of `BufferError` +- `buffer_error_max_tries`: max retries for `BufferError`. +Pass `0` to not retry after `BufferError`. +- `on_delivery`: the delivery callback to be triggered on `poll()` +for the produced message. - + -#### WindowedState.get\_latest\_timestamp +#### Producer.poll ```python -def get_latest_timestamp() -> int +def poll(timeout: float = 0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L282) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L135) -Get the latest observed timestamp for the current state partition. - -Use this timestamp to determine if the arriving event is late and should be -discarded from the processing. +Polls the producer for events and calls `on_delivery` callbacks. -**Returns**: +**Arguments**: -latest observed event timestamp in milliseconds +- `timeout`: poll timeout seconds; Default: 0 (unlike others) +> NOTE: -1 will hang indefinitely if there are no messages to acknowledge - + -#### WindowedState.expire\_windows +#### Producer.flush ```python -def expire_windows(duration_ms: int, grace_ms: int = 0) +def flush(timeout: Optional[float] = None) -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L293) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/producer.py#L143) -Get a list of expired windows from RocksDB considering the current +Wait for all messages in the Producer queue to be delivered. -latest timestamp, window duration and grace period. +**Arguments**: -It also marks the latest found window as expired in the expiration index, so -calling this method multiple times will yield different results for the same -"latest timestamp". 
+- `timeout` (`float`): time to attempt flushing (seconds). +None or -1 is infinite. Default: None -**Arguments**: +**Returns**: -- `duration_ms`: duration of the windows in milliseconds -- `grace_ms`: grace period in milliseconds. Default - "0" +number of messages remaining to flush - + -### WindowedPartitionTransaction +## quixstreams.kafka.consumer + + + +### Consumer ```python -class WindowedPartitionTransaction(WindowedState) +class Consumer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L308) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L66) - + -#### WindowedPartitionTransaction.failed +#### Consumer.\_\_init\_\_ ```python -@property -def failed() -> bool +def __init__(broker_address: str, + consumer_group: Optional[str], + auto_offset_reset: AutoOffsetReset, + auto_commit_enable: bool = True, + assignment_strategy: AssignmentStrategy = "range", + on_commit: Optional[Callable[ + [Optional[KafkaError], List[TopicPartition]], None]] = None, + extra_config: Optional[dict] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L313) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L67) -Return `True` if transaction failed to update data at some point. +A wrapper around `confluent_kafka.Consumer`. -Failed transactions cannot be re-used. +It initializes `confluent_kafka.Consumer` on demand +avoiding network calls during `__init__`, provides typing info for methods +and some reasonable defaults. -**Returns**: +**Arguments**: -bool +- `broker_address`: Kafka broker host and port in format `:`. +Passed as `bootstrap.servers` to `confluent_kafka.Consumer`. +- `consumer_group`: Kafka consumer group. +Passed as `group.id` to `confluent_kafka.Consumer` +- `auto_offset_reset`: Consumer `auto.offset.reset` setting. +Available values: +- "earliest" - automatically reset the offset to the smallest offset +- "latest" - automatically reset the offset to the largest offset +- "error" - trigger an error (ERR__AUTO_OFFSET_RESET) which is retrieved + by consuming messages (used for testing) +- `auto_commit_enable`: If true, periodically commit offset of +the last message handed to the application. Default - `True`. +- `assignment_strategy`: The name of a partition assignment strategy. +Available values: "range", "roundrobin", "cooperative-sticky". +- `on_commit`: Offset commit result propagation callback. +Passed as "offset_commit_cb" to `confluent_kafka.Consumer`. +- `extra_config`: A dictionary with additional options that +will be passed to `confluent_kafka.Consumer` as is. +Note: values passed as arguments override values in `extra_config`. - + -#### WindowedPartitionTransaction.completed +#### Consumer.poll ```python -@property -def completed() -> bool +def poll(timeout: Optional[float] = None) -> Optional[Message] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L323) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L126) -Return `True` if transaction is completed. +Consumes a single message, calls callbacks and returns events. -Completed transactions cannot be re-used. 
+The application must check the returned :py:class:`Message` +object's :py:func:`Message.error()` method to distinguish between proper +messages (error() returns None), or an event or error. + +Note: Callbacks may be called from this method, such as +``on_assign``, ``on_revoke``, et al. + +**Arguments**: + +- `timeout` (`float`): Maximum time in seconds to block waiting for message, +event or callback. None or -1 is infinite. Default: None. + +**Raises**: + +- `None`: RuntimeError if called on a closed consumer **Returns**: -bool +A Message object or None on timeout - + -#### WindowedPartitionTransaction.with\_prefix +#### Consumer.subscribe ```python -def with_prefix(prefix: Any = b"") -> Iterator[Self] +def subscribe(topics: List[str], + on_assign: Optional[RebalancingCallback] = None, + on_revoke: Optional[RebalancingCallback] = None, + on_lost: Optional[RebalancingCallback] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L332) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L144) -A context manager set the prefix for all keys in the scope. +Set subscription to supplied list of topics -Normally, it's called by `StreamingDataFrame` internals to ensure that every -message key is stored separately. +This replaces a previous subscription. **Arguments**: -- `prefix`: key prefix +- `topics` (`list(str)`): List of topics (strings) to subscribe to. +- `on_assign` (`callable`): callback to provide handling of customized offsets +on completion of a successful partition re-assignment. +- `on_revoke` (`callable`): callback to provide handling of offset commits to +a customized store on the start of a rebalance operation. +- `on_lost` (`callable`): callback to provide handling in the case the partition +assignment has been lost. Partitions that have been lost may already be +owned by other members in the group and therefore committing offsets, +for example, may fail. + +**Raises**: -**Returns**: +- `KafkaException`: +- `None`: RuntimeError if called on a closed consumer +.. py:function:: on_assign(consumer, partitions) +.. py:function:: on_revoke(consumer, partitions) +.. py:function:: on_lost(consumer, partitions) -context manager + :param Consumer consumer: Consumer instance. + :param list(TopicPartition) partitions: Absolute list of partitions being + assigned or revoked. - + -#### WindowedPartitionTransaction.maybe\_flush +#### Consumer.unsubscribe ```python -def maybe_flush(offset: Optional[int] = None) +def unsubscribe() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L343) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L238) -Flush the recent updates and last processed offset to the storage. +Remove current subscription. -**Arguments**: +**Raises**: -- `offset`: offset of the last processed message, optional. 
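A minimal poll loop for the `Consumer` wrapper, illustrating `subscribe()` with an `on_assign` callback and the error check described for `poll()`. The broker address, consumer group, and topic name are placeholders.

```python
from quixstreams.kafka import Consumer


def on_assign(consumer, partitions):
    # Called after a successful partition re-assignment (see subscribe() above)
    print(f"Assigned partitions: {partitions}")


consumer = Consumer(
    broker_address="localhost:9092",
    consumer_group="docs-example",
    auto_offset_reset="earliest",
)
consumer.subscribe(["input-topic"], on_assign=on_assign)

while True:
    msg = consumer.poll(timeout=1.0)
    if msg is None:
        continue  # no message within the timeout
    if msg.error() is not None:
        # Proper messages return None from error(); anything else is an event/error
        print(f"Consumer error: {msg.error()}")
        continue
    print(msg.key(), msg.value())
```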
+- `None`: KafkaException +- `None`: RuntimeError if called on a closed consumer - + -### PartitionRecoveryTransaction +#### Consumer.store\_offsets ```python -class PartitionRecoveryTransaction(Protocol) +def store_offsets(message: Optional[Message] = None, + offsets: Optional[List[TopicPartition]] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L354) - -A class for managing recovery for a StorePartition from a changelog message - - - -#### PartitionRecoveryTransaction.flush +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L246) -```python -def flush() -``` +.. py:function:: store_offsets([message=None], [offsets=None]) -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L361) +Store offsets for a message or a list of offsets. -Flush the recovery update and last processed offset to the storage. +``message`` and ``offsets`` are mutually exclusive. The stored offsets +will be committed according to 'auto.commit.interval.ms' or manual +offset-less `commit`. +Note that 'enable.auto.offset.store' must be set to False when using this API. - +**Arguments**: -## quixstreams.utils +- `message` (`confluent_kafka.Message`): Store message's offset+1. +- `offsets` (`list(TopicPartition)`): List of topic+partitions+offsets to store. - +**Raises**: -## quixstreams.utils.json +- `None`: KafkaException +- `None`: RuntimeError if called on a closed consumer - + -#### dumps +#### Consumer.commit ```python -def dumps(value: Any) -> bytes +def commit(message: Optional[Message] = None, + offsets: Optional[List[TopicPartition]] = None, + asynchronous: bool = True) -> Optional[List[TopicPartition]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/utils/json.py#L8) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L280) -Serialize to JSON using `orjson` package. +Commit a message or a list of offsets. + +The ``message`` and ``offsets`` parameters are mutually exclusive. +If neither is set, the current partition assignment's offsets are used instead. +Use this method to commit offsets if you have 'enable.auto.commit' set to False. **Arguments**: -- `value`: value to serialize to JSON +- `message` (`confluent_kafka.Message`): Commit the message's offset+1. +Note: By convention, committed offsets reflect the next message +to be consumed, **not** the last message consumed. +- `offsets` (`list(TopicPartition)`): List of topic+partitions+offsets to commit. +- `asynchronous` (`bool`): If true, asynchronously commit, returning None +immediately. If False, the commit() call will block until the commit +succeeds or fails and the committed offsets will be returned (on success). +Note that specific partitions may have failed and the .err field of +each partition should be checked for success. 
-**Returns**: +**Raises**: -bytes +- `None`: KafkaException +- `None`: RuntimeError if called on a closed consumer - + -#### loads +#### Consumer.committed ```python -def loads(value: bytes) -> Any +def committed(partitions: List[TopicPartition], + timeout: Optional[float] = None) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/utils/json.py#L18) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L320) -Deserialize from JSON using `orjson` package. +.. py:function:: committed(partitions, [timeout=None]) -Main differences: -- It returns `bytes` -- It doesn't allow non-str keys in dictionaries +Retrieve committed offsets for the specified partitions. **Arguments**: -- `value`: value to deserialize from +- `partitions` (`list(TopicPartition)`): List of topic+partitions to query for stored offsets. +- `timeout` (`float`): Request timeout (seconds). +None or -1 is infinite. Default: None -**Returns**: +**Raises**: -object +- `None`: KafkaException +- `None`: RuntimeError if called on a closed consumer - +**Returns**: -## quixstreams.utils.dicts +`list(TopicPartition)`: List of topic+partitions with offset and possibly error set. - + -#### dict\_values +#### Consumer.get\_watermark\_offsets ```python -def dict_values(d: object) -> List +def get_watermark_offsets(partition: TopicPartition, + timeout: Optional[float] = None, + cached: bool = False) -> Tuple[int, int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/utils/dicts.py#L4) - -Recursively unpacks a set of nested dicts to get a flattened list of leaves, - -where "leaves" are the first non-dict item. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L340) -i.e {"a": {"b": {"c": 1}, "d": 2}, "e": 3} becomes [1, 2, 3] +Retrieve low and high offsets for the specified partition. **Arguments**: -- `d`: initially, a dict (with potentially nested dicts) - -**Returns**: - -a list with all the leaves of the various contained dicts +- `partition` (`TopicPartition`): Topic+partition to return offsets for. +- `timeout` (`float`): Request timeout (seconds). None or -1 is infinite. +Ignored if cached=True. Default: None +- `cached` (`bool`): Instead of querying the broker, use cached information. +Cached values: The low offset is updated periodically +(if statistics.interval.ms is set) while the high offset is updated on each +message fetched from the broker for this partition. - +**Raises**: -## quixstreams.types +- `None`: KafkaException +- `None`: RuntimeError if called on a closed consumer - +**Returns**: -## quixstreams.logging +`tuple(int,int)`: Tuple of (low,high) on success or None on timeout. +The high offset is the offset of the last message + 1. - + -#### configure\_logging +#### Consumer.list\_topics ```python -def configure_logging(loglevel: Optional[LogLevel]) -> bool +def list_topics(topic: Optional[str] = None, + timeout: Optional[float] = None) -> ClusterMetadata ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/logging.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L366) -Configure "quixstreams" logger. +.. 
py:function:: list_topics([topic=None], [timeout=-1]) ->***NOTE:*** If "quixstreams" logger already has pre-defined handlers -(e.g. logging has already been configured via `logging`, or the function -is called twice), it will skip configuration and return `False`. +Request metadata from the cluster. +This method provides the same information as +listTopics(), describeTopics() and describeCluster() in the Java Admin client. **Arguments**: -- `loglevel`: a valid log level as a string or None. -If None passed, this function is no-op and no logging will be configured. - -**Returns**: - -True if logging config has been updated, otherwise False. +- `topic` (`str`): If specified, only request information about this topic, +else return results for all topics in cluster. +Warning: If auto.create.topics.enable is set to true on the broker and +an unknown topic is specified, it will be created. +- `timeout` (`float`): The maximum response time before timing out +None or -1 is infinite. Default: None - +**Raises**: -## quixstreams.context +- `None`: KafkaException - + -#### set\_message\_context +#### Consumer.memberid ```python -def set_message_context(context: Optional[MessageContext]) +def memberid() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L21) - -Set a MessageContext for the current message in the given `contextvars.Context` - ->***NOTE:*** This is for advanced usage only. If you need to change the message key, -`StreamingDataFrame.to_topic()` has an argument for it. - - -Example Snippet: - -```python -from quixstreams import Application, set_message_context, message_context +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L389) -# Changes the current sdf value based on what the message partition is. -def alter_context(value): - context = message_context() - if value > 1: - context.headers = context.headers + (b"cool_new_header", value.encode()) - set_message_context(context) +Return this client's broker-assigned group member id. -app = Application() -sdf = app.dataframe() -sdf = sdf.update(lambda value: alter_context(value)) -``` +The member id is assigned by the group coordinator and is propagated to +the consumer during rebalance. -**Arguments**: + :returns: Member id string or None + :rtype: string + :raises: RuntimeError if called on a closed consumer -- `context`: instance of `MessageContext` - + -#### message\_context +#### Consumer.offsets\_for\_times ```python -def message_context() -> MessageContext +def offsets_for_times(partitions: List[TopicPartition], + timeout: Optional[float] = None) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L52) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L402) -Get a MessageContext for the current message, which houses most of the message +Look up offsets by timestamp for the specified partitions. -metadata, like: - - key - - timestamp - - partition - - offset +The returned offset for each partition is the earliest offset whose +timestamp is greater than or equal to the given timestamp in the +corresponding partition. If the provided timestamp exceeds that of the +last message in the partition, a value of -1 will be returned. 
+ :param list(TopicPartition) partitions: topic+partitions with timestamps + in the TopicPartition.offset field. + :param float timeout: The maximum response time before timing out. + None or -1 is infinite. Default: None + :returns: List of topic+partition with offset field set and possibly error set + :rtype: list(TopicPartition) + :raises: KafkaException + :raises: RuntimeError if called on a closed consumer -Example Snippet: -```python -from quixstreams import Application, message_context + -# Changes the current sdf value based on what the message partition is. +#### Consumer.pause -app = Application() -sdf = app.dataframe() -sdf = sdf.apply(lambda value: 1 if message_context().partition == 2 else 0) +```python +def pause(partitions: List[TopicPartition]) ``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L428) -instance of `MessageContext` +Pause consumption for the provided list of partitions. - +Paused partitions must be tracked manually. -#### message\_key +Does NOT affect the result of Consumer.assignment(). -```python -def message_key() -> Any -``` +**Arguments**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/context.py#L83) +- `partitions` (`list(TopicPartition)`): List of topic+partitions to pause. -Get the current message's key. +**Raises**: -Example Snippet: +- `None`: KafkaException -```python -from quixstreams import Application, message_key + -# Changes the current sdf value based on what the message key is. +#### Consumer.resume -app = Application() -sdf = app.dataframe() -sdf = sdf.apply(lambda value: 1 if message_key() == b'1' else 0) +```python +def resume(partitions: List[TopicPartition]) ``` -**Returns**: - -a deserialized message key +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L442) - +.. py:function:: resume(partitions) -## quixstreams.rowconsumer +Resume consumption for the provided list of partitions. - +**Arguments**: -### RowConsumer +- `partitions` (`list(TopicPartition)`): List of topic+partitions to resume. -```python -class RowConsumer(Consumer, RowConsumerProto) -``` +**Raises**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowconsumer.py#L57) +- `None`: KafkaException - + -#### RowConsumer.\_\_init\_\_ +#### Consumer.position ```python -def __init__(broker_address: str, - consumer_group: str, - auto_offset_reset: AutoOffsetReset, - auto_commit_enable: bool = True, - assignment_strategy: AssignmentStrategy = "range", - on_commit: Callable[[Optional[KafkaError], List[TopicPartition]], - None] = None, - extra_config: Optional[dict] = None, - on_error: Optional[ConsumerErrorCallback] = None) +def position(partitions: List[TopicPartition]) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowconsumer.py#L58) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L454) -A consumer class that is capable of deserializing Kafka messages to Rows +Retrieve current positions (offsets) for the specified partitions. -according to the Topics deserialization settings. +**Arguments**: -It overrides `.subscribe()` method of Consumer class to accept `Topic` -objects instead of strings. 
+- `partitions` (`list(TopicPartition)`): List of topic+partitions to return +current offsets for. The current offset is the offset of +the last consumed message + 1. -**Arguments**: +**Raises**: -- `broker_address`: Kafka broker host and port in format `:`. -Passed as `bootstrap.servers` to `confluent_kafka.Consumer`. -- `consumer_group`: Kafka consumer group. -Passed as `group.id` to `confluent_kafka.Consumer` -- `auto_offset_reset`: Consumer `auto.offset.reset` setting. -Available values: -- "earliest" - automatically reset the offset to the smallest offset -- "latest" - automatically reset the offset to the largest offset -- `auto_commit_enable`: If true, periodically commit offset of -the last message handed to the application. Default - `True`. -- `assignment_strategy`: The name of a partition assignment strategy. -Available values: "range", "roundrobin", "cooperative-sticky". -- `on_commit`: Offset commit result propagation callback. -Passed as "offset_commit_cb" to `confluent_kafka.Consumer`. -- `extra_config`: A dictionary with additional options that -will be passed to `confluent_kafka.Consumer` as is. -Note: values passed as arguments override values in `extra_config`. -- `on_error`: a callback triggered when `RowConsumer.poll_row` fails. -If consumer fails and the callback returns `True`, the exception -will be logged but not propagated. -The default callback logs an exception and returns `False`. +- `None`: KafkaException +- `None`: RuntimeError if called on a closed consumer - +**Returns**: -#### RowConsumer.subscribe +`list(TopicPartition)`: List of topic+partitions with offset and possibly error set. + + + +#### Consumer.seek ```python -def subscribe(topics: List[Topic], - on_assign: Optional[RebalancingCallback] = None, - on_revoke: Optional[RebalancingCallback] = None, - on_lost: Optional[RebalancingCallback] = None) +def seek(partition: TopicPartition) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowconsumer.py#L113) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L468) -Set subscription to supplied list of topics. +Set consume position for partition to offset. -This replaces a previous subscription. +The offset may be an absolute (>=0) or a +logical offset (:py:const:`OFFSET_BEGINNING` et.al). -This method also updates the internal mapping with topics that is used -to deserialize messages to Rows. +seek() may only be used to update the consume offset of an +actively consumed partition (i.e., after :py:const:`assign()`), +to set the starting offset of partition not being consumed instead +pass the offset in an `assign()` call. **Arguments**: -- `topics`: list of `Topic` instances to subscribe to. -- `on_assign` (`callable`): callback to provide handling of customized offsets -on completion of a successful partition re-assignment. -- `on_revoke` (`callable`): callback to provide handling of offset commits to -a customized store on the start of a rebalance operation. -- `on_lost` (`callable`): callback to provide handling in the case the partition -assignment has been lost. Partitions that have been lost may already be -owned by other members in the group and therefore committing offsets, -for example, may fail. +- `partition` (`TopicPartition`): Topic+partition+offset to seek to. 
- +**Raises**: -#### RowConsumer.poll\_row +- `None`: KafkaException + + + +#### Consumer.assignment ```python -def poll_row(timeout: float = None) -> Union[Row, List[Row], None] +def assignment() -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowconsumer.py#L147) - -Consumes a single message and deserialize it to Row or a list of Rows. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L485) -The message is deserialized according to the corresponding Topic. -If deserializer raises `IgnoreValue` exception, this method will return None. -If Kafka returns an error, it will be raised as exception. +Returns the current partition assignment. -**Arguments**: +**Raises**: -- `timeout`: poll timeout seconds +- `None`: KafkaException +- `None`: RuntimeError if called on a closed consumer **Returns**: -single Row, list of Rows or None +`list(TopicPartition)`: List of assigned topic+partitions. + + + +#### Consumer.set\_sasl\_credentials + +```python +def set_sasl_credentials(username: str, password: str) +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L498) -## quixstreams.rowproducer +Sets the SASL credentials used for this client. +These credentials will overwrite the old ones, and will be used the next +time the client needs to authenticate. +This method will not disconnect existing broker connections that have been +established with the old credentials. +This method is applicable only to SASL PLAIN and SCRAM mechanisms. - + -### RowProducer +#### Consumer.incremental\_assign ```python -class RowProducer(Producer, RowProducerProto) +def incremental_assign(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowproducer.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L510) -A producer class that is capable of serializing Rows to bytes and send them to Kafka. - -The serialization is performed according to the Topic serialization settings. +Assign new partitions. - It overrides `.subscribe()` method of Consumer class to accept `Topic` - objects instead of strings. - - :param broker_address: Kafka broker host and port in format `:`. - Passed as `bootstrap.servers` to `confluent_kafka.Producer`. - :param partitioner: A function to be used to determine the outgoing message - partition. - Available values: "random", "consistent_random", "murmur2", "murmur2_random", - "fnv1a", "fnv1a_random" - Default - "murmur2". - :param extra_config: A dictionary with additional options that - will be passed to `confluent_kafka.Producer` as is. - Note: values passed as arguments override values in `extra_config`. - :param on_error: a callback triggered when `RowProducer.produce_row()` - or `RowProducer.poll()` fail`. - If producer fails and the callback returns `True`, the exception - will be logged but not propagated. - The default callback logs an exception and returns `False`. +Can be called outside the `Consumer` `on_assign` callback (multiple times). +Partitions immediately show on `Consumer.assignment()`. +Any additional partitions besides the ones passed during the `Consumer` +`on_assign` callback will NOT be associated with the consumer group. 
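A short, hedged sketch of manual partition management with the incremental API described above; the topic name and partition numbers are placeholders.

```python
from confluent_kafka import TopicPartition

from quixstreams.kafka import Consumer

consumer = Consumer(
    broker_address="localhost:9092",
    consumer_group="docs-example",
    auto_offset_reset="earliest",
)

# Attach two partitions; they appear in assignment() immediately,
# but are not associated with the consumer group.
consumer.incremental_assign(
    [TopicPartition("input-topic", 0), TopicPartition("input-topic", 1)]
)
print(consumer.assignment())

# ...consume for a while, then hand one partition back
consumer.incremental_unassign([TopicPartition("input-topic", 1)])
```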
- + -#### RowProducer.produce\_row +#### Consumer.incremental\_unassign ```python -def produce_row(row: Row, - topic: Topic, - key: Optional[Any] = None, - partition: Optional[int] = None, - timestamp: Optional[int] = None) +def incremental_unassign(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowproducer.py#L65) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L522) -Serialize Row to bytes according to the Topic serialization settings +Revoke partitions. -and produce it to Kafka +Can be called outside an on_revoke callback. -If this method fails, it will trigger the provided "on_error" callback. + -**Arguments**: +#### Consumer.close -- `row`: Row object -- `topic`: Topic object -- `key`: message key, optional -- `partition`: partition number, optional -- `timestamp`: timestamp in milliseconds, optional +```python +def close() +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/kafka/consumer.py#L530) -#### RowProducer.poll +Close down and terminate the Kafka Consumer. -```python -def poll(timeout: float = None) -``` +Actions performed: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/rowproducer.py#L102) +- Stops consuming. +- Commits offsets, unless the consumer property 'enable.auto.commit' is set to False. +- Leaves the consumer group. -Polls the producer for events and calls `on_delivery` callbacks. +Registered callbacks may be called from this method, +see `poll()` for more info. -If poll fails, it will trigger the provided "on_error" callback -**Arguments**: + -- `timeout`: timeout in seconds +## quixstreams.kafka.exceptions @@ -6950,7 +7240,7 @@ If poll fails, it will trigger the provided "on_error" callback class Application() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L55) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L59) The main Application class. @@ -6996,7 +7286,7 @@ def __init__(broker_address: Optional[str] = None, quix_sdk_token: Optional[str] = None, consumer_group: Optional[str] = None, auto_offset_reset: AutoOffsetReset = "latest", - auto_commit_enable: bool = True, + commit_interval: float = 5.0, partitioner: Partitioner = "murmur2", consumer_extra_config: Optional[dict] = None, producer_extra_config: Optional[dict] = None, @@ -7015,7 +7305,7 @@ def __init__(broker_address: Optional[str] = None, topic_manager: Optional[TopicManager] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L93) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L97) **Arguments**: @@ -7034,9 +7324,9 @@ Passed as `group.id` to `confluent_kafka.Consumer`. Linked Environment Variable: `Quix__Consumer__Group`. Default - "quixstreams-default" (set during init) >***NOTE:*** Quix Applications will prefix it with the Quix workspace id. +- `commit_interval`: How often to commit the processed messages in seconds. +Default - 5.0. 
- `auto_offset_reset`: Consumer `auto.offset.reset` setting -- `auto_commit_enable`: If true, periodically commit offset of -the last message handed to the application. Default - `True`. - `partitioner`: A function to be used to determine the outgoing message partition. - `consumer_extra_config`: A dictionary with additional options that @@ -7087,7 +7377,6 @@ instead of the default one. def Quix(cls, consumer_group: Optional[str] = None, auto_offset_reset: AutoOffsetReset = "latest", - auto_commit_enable: bool = True, partitioner: Partitioner = "murmur2", consumer_extra_config: Optional[dict] = None, producer_extra_config: Optional[dict] = None, @@ -7106,7 +7395,7 @@ def Quix(cls, topic_manager: Optional[QuixTopicManager] = None) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L296) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L303) >***NOTE:*** DEPRECATED: use Application with `quix_sdk_token` argument instead. @@ -7150,8 +7439,6 @@ Linked Environment Variable: `Quix__Consumer__Group`. Default - "quixstreams-default" (set during init). >***NOTE:*** Quix Applications will prefix it with the Quix workspace id. - `auto_offset_reset`: Consumer `auto.offset.reset` setting -- `auto_commit_enable`: If true, periodically commit offset of -the last message handed to the application. Default - `True`. - `partitioner`: A function to be used to determine the outgoing message partition. - `consumer_extra_config`: A dictionary with additional options that @@ -7210,7 +7497,7 @@ def topic(name: str, timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L436) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L439) Create a topic definition. @@ -7281,7 +7568,7 @@ topic = app.topic("input-topic", timestamp_extractor=custom_ts_extractor) def dataframe(topic: Topic) -> StreamingDataFrame ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L516) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L519) A simple helper method that generates a `StreamingDataFrame`, which is used @@ -7320,10 +7607,10 @@ to be used as an input topic. #### Application.stop ```python -def stop() +def stop(fail: bool = False) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L552) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L556) Stop the internal poll loop and the message processing. @@ -7333,6 +7620,11 @@ likely through some sort of threading). To otherwise stop an application, either send a `SIGTERM` to the process (like Kubernetes does) or perform a typical `KeyboardInterrupt` (`Ctrl+C`). +**Arguments**: + +- `fail`: if True, signals that application is stopped due +to unhandled exception, and it shouldn't commit the current checkpoint. 
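A small sketch of stopping an application from outside the poll loop; the broker address, topic name, and timings are placeholders.

```python
import threading

from quixstreams import Application

# commit_interval controls how often processed offsets are committed (seconds)
app = Application(broker_address="localhost:9092", commit_interval=1.0)
sdf = app.dataframe(app.topic("input-topic"))

# Stop the poll loop after 30 seconds from a background thread.
# Passing fail=True instead would skip committing the current checkpoint.
threading.Timer(30.0, app.stop).start()

app.run(sdf)
```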
+ #### Application.get\_producer @@ -7341,7 +7633,7 @@ To otherwise stop an application, either send a `SIGTERM` to the process def get_producer() -> Producer ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L566) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L579) Create and return a pre-configured Producer instance. The Producer is initialized with params passed to Application. @@ -7369,10 +7661,10 @@ with app.get_producer() as producer: #### Application.get\_consumer ```python -def get_consumer() -> Consumer +def get_consumer(auto_commit_enable: bool = True) -> Consumer ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L597) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L610) Create and return a pre-configured Consumer instance. The Consumer is initialized with params passed to Application. @@ -7413,7 +7705,7 @@ with app.get_consumer() as consumer: def clear_state() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L641) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L654) Clear the state of the application. @@ -7425,11 +7717,11 @@ Clear the state of the application. def run(dataframe: StreamingDataFrame) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/app.py#L719) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/app.py#L660) Start processing data from Kafka using provided `StreamingDataFrame` -One started, can be safely terminated with a `SIGTERM` signal +Once started, it can be safely terminated with a `SIGTERM` signal (like Kubernetes does) or a typical `KeyboardInterrupt` (`Ctrl+C`). @@ -7453,3 +7745,235 @@ app.run(dataframe=df) - `dataframe`: instance of `StreamingDataFrame` + + +## quixstreams.rowconsumer + + + +### RowConsumer + +```python +class RowConsumer(Consumer) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowconsumer.py#L19) + + + +#### RowConsumer.\_\_init\_\_ + +```python +def __init__(broker_address: str, + consumer_group: str, + auto_offset_reset: AutoOffsetReset, + auto_commit_enable: bool = True, + assignment_strategy: AssignmentStrategy = "range", + on_commit: Callable[[Optional[KafkaError], List[TopicPartition]], + None] = None, + extra_config: Optional[dict] = None, + on_error: Optional[ConsumerErrorCallback] = None) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowconsumer.py#L20) + +A consumer class that is capable of deserializing Kafka messages to Rows + +according to the Topics deserialization settings. + +It overrides `.subscribe()` method of Consumer class to accept `Topic` +objects instead of strings. + +**Arguments**: + +- `broker_address`: Kafka broker host and port in format `:`. +Passed as `bootstrap.servers` to `confluent_kafka.Consumer`. +- `consumer_group`: Kafka consumer group. +Passed as `group.id` to `confluent_kafka.Consumer` +- `auto_offset_reset`: Consumer `auto.offset.reset` setting. 
+Available values: +- "earliest" - automatically reset the offset to the smallest offset +- "latest" - automatically reset the offset to the largest offset +- `auto_commit_enable`: If true, periodically commit offset of +the last message handed to the application. Default - `True`. +- `assignment_strategy`: The name of a partition assignment strategy. +Available values: "range", "roundrobin", "cooperative-sticky". +- `on_commit`: Offset commit result propagation callback. +Passed as "offset_commit_cb" to `confluent_kafka.Consumer`. +- `extra_config`: A dictionary with additional options that +will be passed to `confluent_kafka.Consumer` as is. +Note: values passed as arguments override values in `extra_config`. +- `on_error`: a callback triggered when `RowConsumer.poll_row` fails. +If consumer fails and the callback returns `True`, the exception +will be logged but not propagated. +The default callback logs an exception and returns `False`. + + + +#### RowConsumer.subscribe + +```python +def subscribe(topics: List[Topic], + on_assign: Optional[RebalancingCallback] = None, + on_revoke: Optional[RebalancingCallback] = None, + on_lost: Optional[RebalancingCallback] = None) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowconsumer.py#L75) + +Set subscription to supplied list of topics. + +This replaces a previous subscription. + +This method also updates the internal mapping with topics that is used +to deserialize messages to Rows. + +**Arguments**: + +- `topics`: list of `Topic` instances to subscribe to. +- `on_assign` (`callable`): callback to provide handling of customized offsets +on completion of a successful partition re-assignment. +- `on_revoke` (`callable`): callback to provide handling of offset commits to +a customized store on the start of a rebalance operation. +- `on_lost` (`callable`): callback to provide handling in the case the partition +assignment has been lost. Partitions that have been lost may already be +owned by other members in the group and therefore committing offsets, +for example, may fail. + + + +#### RowConsumer.poll\_row + +```python +def poll_row(timeout: float = None) -> Union[Row, List[Row], None] +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/rowconsumer.py#L109) + +Consumes a single message and deserialize it to Row or a list of Rows. + +The message is deserialized according to the corresponding Topic. +If deserializer raises `IgnoreValue` exception, this method will return None. +If Kafka returns an error, it will be raised as exception. + +**Arguments**: + +- `timeout`: poll timeout seconds + +**Returns**: + +single Row, list of Rows or None + + + +## quixstreams.checkpointing.checkpoint + + + +### Checkpoint + +```python +class Checkpoint() +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L20) + +Class to keep track of state updates and consumer offsets and to checkpoint these +updates on schedule. + + + +#### Checkpoint.expired + +```python +def expired() -> bool +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L45) + +Returns `True` if checkpoint deadline has expired. 
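The deadline check behind `expired()` can be pictured with the following illustration. This is not the library's implementation, just a sketch assuming a monotonic clock and a `commit_interval` given in seconds.

```python
import time


class DeadlineSketch:
    """Illustration only: a commit-interval deadline similar to Checkpoint.expired()."""

    def __init__(self, commit_interval: float):
        # The checkpoint "expires" once commit_interval seconds have passed
        # since it was created.
        self._deadline = time.monotonic() + commit_interval

    def expired(self) -> bool:
        return time.monotonic() >= self._deadline


checkpoint = DeadlineSketch(commit_interval=5.0)
print(checkpoint.expired())  # False right after creation
```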
+ + + +#### Checkpoint.empty + +```python +def empty() -> bool +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L51) + +Returns `True` if checkpoint doesn't have any offsets stored yet. + + + + +#### Checkpoint.store\_offset + +```python +def store_offset(topic: str, partition: int, offset: int) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L58) + +Store the offset of the processed message to the checkpoint. + +**Arguments**: + +- `topic`: topic name +- `partition`: partition number +- `offset`: message offset + + + +#### Checkpoint.get\_store\_transaction + +```python +def get_store_transaction( + topic: str, + partition: int, + store_name: str = DEFAULT_STATE_STORE_NAME) -> PartitionTransaction +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L78) + +Get a PartitionTransaction for the given store, topic and partition. + +It will return already started transaction if there's one. + +**Arguments**: + +- `topic`: topic name +- `partition`: partition number +- `store_name`: store name + +**Returns**: + +instance of `PartitionTransaction` + + + +#### Checkpoint.commit + +```python +def commit() +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/checkpointing/checkpoint.py#L101) + +Commit the checkpoint. + +This method will: + 1. Produce the changelogs for each state store + 2. Flush the producer to ensure everything is delivered. + 3. Commit topic offsets. + 4. Flush each state store partition to the disk. + + + +## quixstreams.checkpointing + + + +## quixstreams.checkpointing.exceptions + diff --git a/docs/api-reference/serialization.md b/docs/api-reference/serialization.md index 5c40047f3..691daf043 100644 --- a/docs/api-reference/serialization.md +++ b/docs/api-reference/serialization.md @@ -10,7 +10,7 @@ class QuixDeserializer(JSONDeserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L73) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L73) Handles Deserialization for any Quix-formatted topic. @@ -27,7 +27,7 @@ def __init__(column_name: Optional[str] = None, loads: Callable[[Union[bytes, bytearray]], Any] = default_loads) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L80) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L80)
@@ -49,7 +49,7 @@ Default - :py:func:`quixstreams.utils.json.loads`. def split_values() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L100) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L100) Each Quix message might contain data for multiple Rows. This property informs the downstream processors about that, so they can @@ -66,7 +66,7 @@ def deserialize(model_key: str, value: Union[List[Mapping], Mapping]) -> Iterable[Mapping] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L153) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L153) Deserialization function for particular data types (Timeseries or EventData). @@ -91,7 +91,7 @@ Iterable of dicts class QuixTimeseriesSerializer(QuixSerializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L321) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L321) Serialize data to JSON formatted according to Quix Timeseries format. @@ -123,7 +123,7 @@ Output: class QuixEventsSerializer(QuixSerializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/quix.py#L409) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/quix.py#L409) Serialize data to JSON formatted according to Quix EventData format. 
The input value is expected to be a dictionary with the following keys: @@ -164,7 +164,7 @@ Output: class BytesDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L44) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L44) A deserializer to bypass bytes without any changes @@ -176,7 +176,7 @@ A deserializer to bypass bytes without any changes class BytesSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L55) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L55) A serializer to bypass bytes without any changes @@ -188,7 +188,7 @@ A serializer to bypass bytes without any changes class StringDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L64) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L64) @@ -200,7 +200,7 @@ class StringDeserializer(Deserializer) def __init__(column_name: Optional[str] = None, codec: str = "utf_8") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L65) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L65) Deserializes bytes to strings using the specified encoding. @@ -219,7 +219,7 @@ A wrapper around `confluent_kafka.serialization.StringDeserializer`. class IntegerDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L84) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L84) Deserializes bytes to integers. @@ -233,7 +233,7 @@ A wrapper around `confluent_kafka.serialization.IntegerDeserializer`. class DoubleDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L103) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L103) Deserializes float to IEEE 764 binary64. @@ -247,7 +247,7 @@ A wrapper around `confluent_kafka.serialization.DoubleDeserializer`. 
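To show where these serializers and deserializers plug in, here is a hedged sketch wiring `DoubleDeserializer`/`DoubleSerializer` into topic definitions. It assumes `Application.topic()` accepts `value_deserializer` and `value_serializer` instances; the broker address and topic names are placeholders.

```python
from quixstreams import Application
from quixstreams.models.serializers.simple_types import (
    DoubleDeserializer,
    DoubleSerializer,
)

app = Application(broker_address="localhost:9092")

# Assumption: topic definitions accept serializer/deserializer instances,
# so raw IEEE-754 payloads are read and written as Python floats.
input_topic = app.topic("raw-doubles", value_deserializer=DoubleDeserializer())
output_topic = app.topic("scaled-doubles", value_serializer=DoubleSerializer())

sdf = app.dataframe(input_topic)
sdf = sdf.apply(lambda value: value * 1.5)
sdf = sdf.to_topic(output_topic)

if __name__ == "__main__":
    app.run(sdf)
```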
class StringSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L122) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L122) @@ -259,7 +259,7 @@ class StringSerializer(Serializer) def __init__(codec: str = "utf_8") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L123) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L123) Serializes strings to bytes using the specified encoding. @@ -277,7 +277,7 @@ Serializes strings to bytes using the specified encoding. class IntegerSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L135) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L135) Serializes integers to bytes @@ -289,7 +289,7 @@ Serializes integers to bytes class DoubleSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/serializers/simple_types.py#L148) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/serializers/simple_types.py#L148) Serializes floats to bytes diff --git a/docs/api-reference/state.md b/docs/api-reference/state.md index df6a283de..b9cf51c76 100644 --- a/docs/api-reference/state.md +++ b/docs/api-reference/state.md @@ -10,7 +10,7 @@ class State(Protocol) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L151) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L136) Primary interface for working with key-value state data from `StreamingDataFrame` @@ -24,7 +24,7 @@ Primary interface for working with key-value state data from `StreamingDataFrame def get(key: Any, default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L156) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L141) Get the value for key if key is present in the state, else default @@ -51,7 +51,7 @@ value or None if the key is not found and `default` is not provided def set(key: Any, value: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L166) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L151) Set value for the key. @@ -72,7 +72,7 @@ Set value for the key. def delete(key: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L174) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L159) Delete value for the key. 
@@ -94,7 +94,7 @@ This function always returns `None`, even if value is not found. def exists(key: Any) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/types.py#L183) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/types.py#L168) Check if the key exists in state. @@ -123,7 +123,7 @@ True if key exists, False otherwise class RocksDBOptions(RocksDBOptionsType) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/options.py#L25) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/options.py#L25) RocksDB database options. @@ -148,7 +148,7 @@ Please see `rocksdict.Options` for a complete description of other options. def to_options() -> rocksdict.Options ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/state/rocksdb/options.py#L53) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/state/rocksdb/options.py#L53) Convert parameters to `rocksdict.Options` diff --git a/docs/api-reference/topics.md b/docs/api-reference/topics.md index fa90bd1fa..22c500f84 100644 --- a/docs/api-reference/topics.md +++ b/docs/api-reference/topics.md @@ -16,7 +16,7 @@ def convert_topic_list(topics: List[Topic]) -> List[ConfluentTopic] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L23) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L23) Converts `Topic`s to `ConfluentTopic`s as required for Confluent's @@ -42,7 +42,7 @@ list of confluent_kafka `ConfluentTopic`s class TopicAdmin() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L46) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L46) For performing "admin"-level operations on a Kafka cluster, mostly around topics. @@ -58,7 +58,7 @@ Primarily used to create and inspect topic configurations. def __init__(broker_address: str, extra_config: Optional[Mapping] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L53) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L53)
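For orientation, a small sketch of constructing the `TopicAdmin` documented above, using only the `__init__` signature shown; the broker address and client id are assumptions for illustration, not values taken from this patch.

```python
# Illustrative sketch: constructing a TopicAdmin with the signature above.
# The broker address and extra_config contents are assumptions.
from quixstreams.models.topics.admin import TopicAdmin

admin = TopicAdmin(
    broker_address="localhost:9092",
    extra_config={"client.id": "docs-example"},  # optional Mapping
)
```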
@@ -77,7 +77,7 @@ def __init__(broker_address: str, extra_config: Optional[Mapping] = None) def list_topics() -> Dict[str, ConfluentTopicMetadata] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L74) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L74) Get a list of topics and their metadata from a Kafka cluster @@ -97,7 +97,7 @@ a dict of topic names and their metadata objects def inspect_topics(topic_names: List[str]) -> Dict[str, Optional[TopicConfig]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L83) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L83) A simplified way of getting the topic configurations of the provided topics @@ -127,7 +127,7 @@ def create_topics(topics: List[Topic], finalize_timeout: int = 60) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/admin.py#L156) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/admin.py#L156) Create the given list of topics and confirm they are ready. @@ -155,7 +155,7 @@ fail (it ignores issues for a topic already existing). class TopicConfig() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L43) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L43) Represents all kafka-level configuration for a kafka topic. @@ -169,7 +169,7 @@ Generally used by Topic and any topic creation procedures. class Topic() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L84) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L84) A definition of a Kafka topic. @@ -194,7 +194,7 @@ def __init__( timestamp_extractor: Optional[TimestampExtractor] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L93) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L93)
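Continuing the sketch above, the admin methods documented in this file can be exercised roughly as follows; the topic name is an illustrative assumption.

```python
# Illustrative continuation of the TopicAdmin sketch above.
# list_topics() returns per-topic metadata keyed by topic name.
metadata = admin.list_topics()
print(sorted(metadata))

# inspect_topics() returns a TopicConfig per requested topic,
# or None for topics that do not exist on the broker.
configs = admin.inspect_topics(["events"])
if configs["events"] is None:
    print("topic 'events' does not exist yet")
```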
@@ -220,7 +220,7 @@ milliseconds from a deserialized message. def name() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L122) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L122) Topic name @@ -234,7 +234,7 @@ Topic name def row_serialize(row: Row, key: Optional[Any] = None) -> KafkaMessage ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L132) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L132) Serialize Row to a Kafka message structure @@ -262,7 +262,7 @@ def row_deserialize( message: ConfluentKafkaMessageProto) -> Union[Row, List[Row], None] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/topic.py#L155) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/topic.py#L155) Deserialize incoming Kafka message to a Row. @@ -292,7 +292,7 @@ Row, list of Rows or None if the message is ignored. def affirm_ready_for_create(topics: List[Topic]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L19) Validate a list of topics is ready for creation attempt @@ -310,7 +310,7 @@ Validate a list of topics is ready for creation attempt class TopicManager() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L29) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L29) The source of all topic management with quixstreams. @@ -330,7 +330,7 @@ See methods for details. def __init__(topic_admin: TopicAdmin, create_timeout: int = 60) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L48) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L48)
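As a rough illustration of the constructor documented above, a `TopicManager` can be wired to a `TopicAdmin` as shown below; in typical use the `Application` builds and manages this object internally, so this is only a sketch.

```python
# Illustrative sketch: wiring the TopicManager documented above to the admin
# client from the earlier sketch. Normally handled by the Application.
from quixstreams.models.topics.manager import TopicManager

topic_manager = TopicManager(topic_admin=admin, create_timeout=60)
```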
@@ -350,7 +350,7 @@ def __init__(topic_admin: TopicAdmin, create_timeout: int = 60) def changelog_topics() -> Dict[str, Dict[str, Topic]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L71) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L71) Note: `Topic`s are the changelogs. @@ -368,7 +368,7 @@ def topic_config(num_partitions: Optional[int] = None, extra_config: Optional[dict] = None) -> TopicConfig ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L121) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L119) Convenience method for generating a `TopicConfig` with default settings @@ -402,7 +402,7 @@ def topic(name: str, timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L142) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L140) A convenience method for generating a `Topic`. Will use default config options @@ -438,7 +438,7 @@ def changelog_topic(topic_name: str, store_name: str, consumer_group: str) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L191) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L189) Performs all the logic necessary to generate a changelog topic based on a @@ -483,7 +483,7 @@ generate changelog topics. To turn off changelogs, init an Application with def create_topics(topics: List[Topic]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L262) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L260) Creates topics via an explicit list of provided `Topics`. @@ -506,7 +506,7 @@ Exists as a way to manually specify what topics to create; otherwise, def create_all_topics() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L277) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L275) A convenience method to create all Topic objects stored on this TopicManager. @@ -520,7 +520,7 @@ A convenience method to create all Topic objects stored on this TopicManager. def validate_all_topics() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/356d83c8caf613065f333dcd470e004443c12544/quixstreams/models/topics/manager.py#L283) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/3787271060f892c91ec0aef934bfd09c55790e92/quixstreams/models/topics/manager.py#L281) Validates all topics exist and changelogs have correct topic and rep factor.
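Finally, a hedged sketch of the declaration, creation, and validation methods documented above. The topic name, store name, and consumer group are assumptions for the example; changelog topics are normally derived internally per store.

```python
# Illustrative continuation: declaring topics via the manager and making sure
# they exist on the broker. Names and groups are assumptions for the example.
events_topic = topic_manager.topic("events")

# Changelog topics are keyed by source topic, store name, and consumer group
changelog = topic_manager.changelog_topic(
    topic_name=events_topic.name,
    store_name="default",
    consumer_group="docs-example",
)

# Create any registered topics that are missing, then validate the result
topic_manager.create_all_topics()
topic_manager.validate_all_topics()
```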