API refactor

2025-10-07 16:25:52 +09:00
parent 76d0d86211
commit 91c7e04474
1171 changed files with 81940 additions and 44117 deletions

kafka/producer/buffer.py

@@ -1,115 +0,0 @@
from __future__ import absolute_import, division
import collections
import io
import threading
import time
from kafka.metrics.stats import Rate
import kafka.errors as Errors
class SimpleBufferPool(object):
"""A simple pool of BytesIO objects with a weak memory ceiling."""
def __init__(self, memory, poolable_size, metrics=None, metric_group_prefix='producer-metrics'):
"""Create a new buffer pool.
Arguments:
memory (int): maximum memory that this buffer pool can allocate
poolable_size (int): memory size per buffer to cache in the free
list rather than deallocating
"""
self._poolable_size = poolable_size
self._lock = threading.RLock()
buffers = int(memory / poolable_size) if poolable_size else 0
self._free = collections.deque([io.BytesIO() for _ in range(buffers)])
self._waiters = collections.deque()
self.wait_time = None
if metrics:
self.wait_time = metrics.sensor('bufferpool-wait-time')
self.wait_time.add(metrics.metric_name(
'bufferpool-wait-ratio', metric_group_prefix,
'The fraction of time an appender waits for space allocation.'),
Rate())
def allocate(self, size, max_time_to_block_ms):
"""
Allocate a buffer of the given size. This method blocks if there is not
enough memory and the buffer pool is configured with blocking mode.
Arguments:
size (int): The buffer size to allocate in bytes [ignored]
max_time_to_block_ms (int): The maximum time in milliseconds to
block for buffer memory to be available
Returns:
io.BytesIO
"""
with self._lock:
# check if we have a free buffer of the right size pooled
if self._free:
return self._free.popleft()
elif self._poolable_size == 0:
return io.BytesIO()
else:
# we are out of buffers and will have to block
buf = None
more_memory = threading.Condition(self._lock)
self._waiters.append(more_memory)
# loop over and over until we have a buffer or have reserved
# enough memory to allocate one
while buf is None:
start_wait = time.time()
more_memory.wait(max_time_to_block_ms / 1000.0)
end_wait = time.time()
if self.wait_time:
self.wait_time.record(end_wait - start_wait)
if self._free:
buf = self._free.popleft()
else:
self._waiters.remove(more_memory)
raise Errors.KafkaTimeoutError(
"Failed to allocate memory within the configured"
" max blocking time")
# remove the condition for this thread to let the next thread
# in line start getting memory
removed = self._waiters.popleft()
assert removed is more_memory, 'Wrong condition'
# signal any additional waiters if there is more memory left
# over for them
if self._free and self._waiters:
self._waiters[0].notify()
# unlock and return the buffer
return buf
def deallocate(self, buf):
"""
Return buffers to the pool. If they are of the poolable size add them
to the free list, otherwise just mark the memory as free.
Arguments:
buffer_ (io.BytesIO): The buffer to return
"""
with self._lock:
# BytesIO.truncate here makes the pool somewhat pointless
# but we stick with the BufferPool API until migrating to
# bytesarray / memoryview. The buffer we return must not
# expose any prior data on read().
buf.truncate(0)
self._free.append(buf)
if self._waiters:
self._waiters[0].notify()
def queued(self):
"""The number of threads blocked waiting on memory."""
with self._lock:
return len(self._waiters)
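
A minimal usage sketch of the pool above (illustrative buffer sizes; note that allocate() ignores its size argument, per the docstring):

    # Pool with room for four poolable buffers of 16 KiB each.
    pool = SimpleBufferPool(memory=4 * 16384, poolable_size=16384)

    buf = pool.allocate(size=16384, max_time_to_block_ms=1000)
    buf.write(b'record bytes')
    # ... use the buffer for a batch, then return it to the pool:
    pool.deallocate(buf)  # truncates to 0 bytes and wakes one waiting thread

    # Once all pooled buffers are checked out, the next allocate() blocks
    # for up to max_time_to_block_ms and then raises KafkaTimeoutError.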

kafka/producer/future.py

@@ -38,7 +38,7 @@ class FutureRecordMetadata(Future):
produce_future.add_errback(self.failure)
def _produce_success(self, offset_and_timestamp):
offset, produce_timestamp_ms, log_start_offset = offset_and_timestamp
offset, produce_timestamp_ms = offset_and_timestamp
# Unpacking from args tuple is minor speed optimization
(relative_offset, timestamp_ms, checksum,
@@ -51,7 +51,7 @@ class FutureRecordMetadata(Future):
if offset != -1 and relative_offset is not None:
offset += relative_offset
tp = self._produce_future.topic_partition
metadata = RecordMetadata(tp[0], tp[1], tp, offset, timestamp_ms, log_start_offset,
metadata = RecordMetadata(tp[0], tp[1], tp, offset, timestamp_ms,
checksum, serialized_key_size,
serialized_value_size, serialized_header_size)
self.success(metadata)
@@ -67,5 +67,5 @@ class FutureRecordMetadata(Future):
RecordMetadata = collections.namedtuple(
'RecordMetadata', ['topic', 'partition', 'topic_partition', 'offset', 'timestamp', 'log_start_offset',
'RecordMetadata', ['topic', 'partition', 'topic_partition', 'offset', 'timestamp',
'checksum', 'serialized_key_size', 'serialized_value_size', 'serialized_header_size'])

kafka/producer/kafka.py

@@ -1,11 +1,11 @@
from __future__ import absolute_import
from __future__ import absolute_import, division
import atexit
import copy
import logging
import socket
import threading
import time
import warnings
import weakref
from kafka.vendor import six
@@ -18,10 +18,12 @@ from kafka.partitioner.default import DefaultPartitioner
from kafka.producer.future import FutureRecordMetadata, FutureProduceResult
from kafka.producer.record_accumulator import AtomicInteger, RecordAccumulator
from kafka.producer.sender import Sender
from kafka.producer.transaction_manager import TransactionManager
from kafka.record.default_records import DefaultRecordBatchBuilder
from kafka.record.legacy_records import LegacyRecordBatchBuilder
from kafka.serializer import Serializer
from kafka.structs import TopicPartition
from kafka.util import Timer, ensure_valid_topic_name
log = logging.getLogger(__name__)
@@ -34,8 +36,8 @@ class KafkaProducer(object):
The producer is thread safe and sharing a single producer instance across
threads will generally be faster than having multiple instances.
The producer consists of a pool of buffer space that holds records that
haven't yet been transmitted to the server as well as a background I/O
The producer consists of a RecordAccumulator which holds records that
haven't yet been transmitted to the server, and a Sender background I/O
thread that is responsible for turning these records into requests and
transmitting them to the cluster.
@@ -71,14 +73,50 @@ class KafkaProducer(object):
can lead to fewer, more efficient requests when not under maximal load at
the cost of a small amount of latency.
The buffer_memory controls the total amount of memory available to the
producer for buffering. If records are sent faster than they can be
transmitted to the server then this buffer space will be exhausted. When
the buffer space is exhausted additional send calls will block.
The key_serializer and value_serializer instruct how to turn the key and
value objects the user provides into bytes.
From Kafka 0.11, the KafkaProducer supports two additional modes:
the idempotent producer and the transactional producer.
The idempotent producer strengthens Kafka's delivery semantics from
at least once to exactly once delivery. In particular, producer retries
will no longer introduce duplicates. The transactional producer allows an
application to send messages to multiple partitions (and topics!)
atomically.
To enable idempotence, the `enable_idempotence` configuration must be set
to True. If set, the `retries` config will default to `float('inf')` and
the `acks` config will default to 'all'. There are no API changes for the
idempotent producer, so existing applications will not need to be modified
to take advantage of this feature.
To take advantage of the idempotent producer, it is imperative to avoid
application level re-sends since these cannot be de-duplicated. As such, if
an application enables idempotence, it is recommended to leave the
`retries` config unset, as it will be defaulted to `float('inf')`.
Additionally, if a :meth:`~kafka.KafkaProducer.send` returns an error even
with infinite retries (for instance if the message expires in the buffer
before being sent), then it is recommended to shut down the producer and
check the contents of the last produced message to ensure that it is not
duplicated. Finally, the producer can only guarantee idempotence for
messages sent within a single session.
To use the transactional producer and the attendant APIs, you must set the
`transactional_id` configuration property. If the `transactional_id` is
set, idempotence is automatically enabled along with the producer configs
which idempotence depends on. Further, topics which are included in
transactions should be configured for durability. In particular, the
`replication.factor` should be at least `3`, and the `min.insync.replicas`
for these topics should be set to 2. Finally, in order for transactional
guarantees to be realized from end-to-end, the consumers must be
configured to read only committed messages as well.
The purpose of the `transactional_id` is to enable transaction recovery
across multiple sessions of a single producer instance. It would typically
be derived from the shard identifier in a partitioned, stateful,
application. As such, it should be unique to each producer instance running
within a partitioned application.
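
A minimal sketch of the transactional flow described above, assuming a broker >= 0.11 and the transaction methods introduced in this commit (hostnames and ids are illustrative):

    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',
        transactional_id='my-app-shard-0',  # implicitly enables idempotence
    )
    producer.init_transactions()  # call exactly once, before any transaction
    try:
        producer.begin_transaction()
        producer.send('topic-a', b'msg1')
        producer.send('topic-b', b'msg2')
        producer.commit_transaction()
    except Exception:
        producer.abort_transaction()
        raise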
Keyword Arguments:
bootstrap_servers: 'host[:port]' string (or list of 'host[:port]'
strings) that the producer should contact to bootstrap initial
@@ -96,6 +134,28 @@ class KafkaProducer(object):
value_serializer (callable): used to convert user-supplied message
values to bytes. If not None, called as f(value), should return
bytes. Default: None.
enable_idempotence (bool): When set to True, the producer will ensure
that exactly one copy of each message is written in the stream.
If False, producer retries due to broker failures, etc., may write
duplicates of the retried message in the stream. Default: False.
Note that enabling idempotence requires
`max_in_flight_requests_per_connection` to be set to 1 and `retries`
cannot be zero. Additionally, `acks` must be set to 'all'. If these
values are left at their defaults, the producer will override the
defaults to be suitable. If the values are set to something
incompatible with the idempotent producer, a KafkaConfigurationError
will be raised.
delivery_timeout_ms (float): An upper bound on the time to report success
or failure after producer.send() returns. This limits the total time
that a record will be delayed prior to sending, the time to await
acknowledgement from the broker (if expected), and the time allowed
for retriable send failures. The producer may report failure to send
a record earlier than this config if either an unrecoverable error is
encountered, the retries have been exhausted, or the record is added
to a batch which reached an earlier delivery expiration deadline.
The value of this config should be greater than or equal to the
sum of (request_timeout_ms + linger_ms). Default: 120000.
acks (0, 1, 'all'): The number of acknowledgments the producer requires
the leader to have received before considering a request complete.
This controls the durability of records that are sent. The
@@ -123,7 +183,7 @@ class KafkaProducer(object):
Compression is of full batches of data, so the efficacy of batching
will also impact the compression ratio (more batching means better
compression). Default: None.
retries (int): Setting a value greater than zero will cause the client
retries (numeric): Setting a value greater than zero will cause the client
to resend any record whose send fails with a potentially transient
error. Note that this retry is no different than if the client
resent the record upon receiving the error. Allowing retries
@@ -131,8 +191,12 @@ class KafkaProducer(object):
potentially change the ordering of records because if two batches
are sent to a single partition, and the first fails and is retried
but the second succeeds, then the records in the second batch may
appear first.
Default: 0.
appear first. Note additionally that produce requests will be
failed before the number of retries has been exhausted if the timeout
configured by delivery_timeout_ms expires first before successful
acknowledgement. Users should generally prefer to leave this config
unset and instead use delivery_timeout_ms to control retry behavior.
Default: float('inf') (infinite)
batch_size (int): Requests sent to brokers will contain multiple
batches, one for each partition with data available to be sent.
A small batch size will make batching less common and may reduce
@@ -165,12 +229,6 @@ class KafkaProducer(object):
messages with the same key are assigned to the same partition.
When a key is None, the message is delivered to a random partition
(filtered to partitions with available leaders only, if possible).
buffer_memory (int): The total bytes of memory the producer should use
to buffer records waiting to be sent to the server. If records are
sent faster than they can be delivered to the server the producer
will block up to max_block_ms, raising an exception on timeout.
In the current implementation, this setting is an approximation.
Default: 33554432 (32MB)
connections_max_idle_ms: Close idle connections after the number of
milliseconds specified by this config. The broker closes idle
connections after connections.max.idle.ms, so this avoids hitting
@@ -188,6 +246,9 @@ class KafkaProducer(object):
This setting will limit the number of record batches the producer
will send in a single request to avoid sending huge requests.
Default: 1048576.
allow_auto_create_topics (bool): Enable/disable auto topic creation
on metadata request. Only available with api_version >= (0, 11).
Default: True
metadata_max_age_ms (int): The period of time in milliseconds after
which we force a refresh of metadata even if we haven't seen any
partition leadership changes to proactively discover any new
@@ -216,7 +277,7 @@ class KafkaProducer(object):
reconnection attempts will continue periodically with this fixed
rate. To avoid connection storms, a randomization factor of 0.2
will be applied to the backoff resulting in a random range between
20% below and 20% above the computed value. Default: 1000.
20% below and 20% above the computed value. Default: 30000.
max_in_flight_requests_per_connection (int): Requests are pipelined
to kafka brokers up to this number of maximum requests per
broker connection. Note that if this setting is set to be greater
@@ -233,7 +294,7 @@ class KafkaProducer(object):
should verify that the certificate matches the brokers hostname.
default: true.
ssl_cafile (str): optional filename of ca file to use in certificate
veriication. default: none.
verification. default: none.
ssl_certfile (str): optional filename of file in pem format containing
the client certificate, as well as any ca certificates needed to
establish the certificate's authenticity. default: none.
@@ -252,14 +313,28 @@ class KafkaProducer(object):
or other configuration forbids use of all the specified ciphers),
an ssl.SSLError will be raised. See ssl.SSLContext.set_ciphers
api_version (tuple): Specify which Kafka API version to use. If set to
None, the client will attempt to infer the broker version by probing
various APIs. Example: (0, 10, 2). Default: None
None, the client will attempt to determine the broker version via
ApiVersionsRequest API or, for brokers earlier than 0.10, probing
various known APIs. Dynamic version checking is performed eagerly
during __init__ and can raise NoBrokersAvailableError if no connection
was made before timeout (see api_version_auto_timeout_ms below).
Different versions enable different functionality.
Examples:
(3, 9) most recent broker release, enable all supported features
(0, 11) enables message format v2 (internal)
(0, 10, 0) enables sasl authentication and message format v1
(0, 8, 0) enables basic functionality only
Default: None
api_version_auto_timeout_ms (int): number of milliseconds to throw a
timeout exception from the constructor when checking the broker
api version. Only applies if api_version set to None.
Default: 2000
metric_reporters (list): A list of classes to use as metrics reporters.
Implementing the AbstractMetricsReporter interface allows plugging
in classes that will be notified of new metric creation. Default: []
metrics_enabled (bool): Whether to track metrics on this instance. Default True.
metrics_num_samples (int): The number of samples maintained to compute
metrics. Default: 2
metrics_sample_window_ms (int): The maximum age in milliseconds of
@@ -274,33 +349,42 @@ class KafkaProducer(object):
Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication.
Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
sasl_kerberos_name (str or gssapi.Name): Constructed gssapi.Name for use with
sasl mechanism handshake. If provided, sasl_kerberos_service_name and
sasl_kerberos_domain_name are ignored. Default: None.
sasl_kerberos_service_name (str): Service name to include in GSSAPI
sasl mechanism handshake. Default: 'kafka'
sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI
sasl mechanism handshake. Default: one of bootstrap servers
sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider
instance. (See kafka.oauth.abstract). Default: None
sasl_oauth_token_provider (kafka.sasl.oauth.AbstractTokenProvider): OAuthBearer
token provider instance. Default: None
socks5_proxy (str): Socks5 proxy URL. Default: None
kafka_client (callable): Custom class / callable for creating KafkaClient instances
Note:
Configuration parameters are described in more detail at
https://kafka.apache.org/0100/configuration.html#producerconfigs
https://kafka.apache.org/0100/documentation/#producerconfigs
"""
DEFAULT_CONFIG = {
'bootstrap_servers': 'localhost',
'client_id': None,
'key_serializer': None,
'value_serializer': None,
'enable_idempotence': False,
'transactional_id': None,
'transaction_timeout_ms': 60000,
'delivery_timeout_ms': 120000,
'acks': 1,
'bootstrap_topics_filter': set(),
'compression_type': None,
'retries': 0,
'retries': float('inf'),
'batch_size': 16384,
'linger_ms': 0,
'partitioner': DefaultPartitioner(),
'buffer_memory': 33554432,
'connections_max_idle_ms': 9 * 60 * 1000,
'max_block_ms': 60000,
'max_request_size': 1048576,
'allow_auto_create_topics': True,
'metadata_max_age_ms': 300000,
'retry_backoff_ms': 100,
'request_timeout_ms': 30000,
@@ -310,7 +394,7 @@ class KafkaProducer(object):
'sock_chunk_bytes': 4096, # undocumented experimental option
'sock_chunk_buffer_count': 1000, # undocumented experimental option
'reconnect_backoff_ms': 50,
'reconnect_backoff_max_ms': 1000,
'reconnect_backoff_max_ms': 30000,
'max_in_flight_requests_per_connection': 5,
'security_protocol': 'PLAINTEXT',
'ssl_context': None,
@@ -324,17 +408,23 @@ class KafkaProducer(object):
'api_version': None,
'api_version_auto_timeout_ms': 2000,
'metric_reporters': [],
'metrics_enabled': True,
'metrics_num_samples': 2,
'metrics_sample_window_ms': 30000,
'selector': selectors.DefaultSelector,
'sasl_mechanism': None,
'sasl_plain_username': None,
'sasl_plain_password': None,
'sasl_kerberos_name': None,
'sasl_kerberos_service_name': 'kafka',
'sasl_kerberos_domain_name': None,
'sasl_oauth_token_provider': None
'sasl_oauth_token_provider': None,
'socks5_proxy': None,
'kafka_client': KafkaClient,
}
DEPRECATED_CONFIGS = ('buffer_memory',)
_COMPRESSORS = {
'gzip': (has_gzip, LegacyRecordBatchBuilder.CODEC_GZIP),
'snappy': (has_snappy, LegacyRecordBatchBuilder.CODEC_SNAPPY),
@@ -344,12 +434,17 @@ class KafkaProducer(object):
}
def __init__(self, **configs):
log.debug("Starting the Kafka producer") # trace
self.config = copy.copy(self.DEFAULT_CONFIG)
user_provided_configs = set(configs.keys())
for key in self.config:
if key in configs:
self.config[key] = configs.pop(key)
for key in self.DEPRECATED_CONFIGS:
if key in configs:
configs.pop(key)
warnings.warn('Deprecated Producer config: %s' % (key,), DeprecationWarning)
# Only check for extra config keys in top-level class
assert not configs, 'Unrecognized configs: %s' % (configs,)
@@ -367,30 +462,35 @@ class KafkaProducer(object):
self.config['api_version'] = None
else:
self.config['api_version'] = tuple(map(int, deprecated.split('.')))
log.warning('use api_version=%s [tuple] -- "%s" as str is deprecated',
str(self.config['api_version']), deprecated)
log.warning('%s: use api_version=%s [tuple] -- "%s" as str is deprecated',
str(self), str(self.config['api_version']), deprecated)
log.debug("%s: Starting Kafka producer", str(self))
# Configure metrics
metrics_tags = {'client-id': self.config['client_id']}
metric_config = MetricConfig(samples=self.config['metrics_num_samples'],
time_window_ms=self.config['metrics_sample_window_ms'],
tags=metrics_tags)
reporters = [reporter() for reporter in self.config['metric_reporters']]
self._metrics = Metrics(metric_config, reporters)
if self.config['metrics_enabled']:
metrics_tags = {'client-id': self.config['client_id']}
metric_config = MetricConfig(samples=self.config['metrics_num_samples'],
time_window_ms=self.config['metrics_sample_window_ms'],
tags=metrics_tags)
reporters = [reporter() for reporter in self.config['metric_reporters']]
self._metrics = Metrics(metric_config, reporters)
else:
self._metrics = None
client = KafkaClient(metrics=self._metrics, metric_group_prefix='producer',
wakeup_timeout_ms=self.config['max_block_ms'],
**self.config)
client = self.config['kafka_client'](
metrics=self._metrics, metric_group_prefix='producer',
wakeup_timeout_ms=self.config['max_block_ms'],
**self.config)
# Get auto-discovered version from client if necessary
if self.config['api_version'] is None:
self.config['api_version'] = client.config['api_version']
# Get auto-discovered / normalized version from client
self.config['api_version'] = client.config['api_version']
if self.config['compression_type'] == 'lz4':
assert self.config['api_version'] >= (0, 8, 2), 'LZ4 Requires >= Kafka 0.8.2 Brokers'
if self.config['compression_type'] == 'zstd':
assert self.config['api_version'] >= (2, 1, 0), 'Zstd Requires >= Kafka 2.1.0 Brokers'
assert self.config['api_version'] >= (2, 1), 'Zstd Requires >= Kafka 2.1 Brokers'
# Check compression_type for library support
ct = self.config['compression_type']
@@ -401,12 +501,58 @@ class KafkaProducer(object):
assert checker(), "Libraries for {} compression codec not found".format(ct)
self.config['compression_attrs'] = compression_attrs
message_version = self._max_usable_produce_magic()
self._accumulator = RecordAccumulator(message_version=message_version, metrics=self._metrics, **self.config)
self._metadata = client.cluster
self._transaction_manager = None
self._init_transactions_result = None
if 'enable_idempotence' in user_provided_configs and not self.config['enable_idempotence'] and self.config['transactional_id']:
raise Errors.KafkaConfigurationError("Cannot set transactional_id without enable_idempotence.")
if self.config['transactional_id']:
self.config['enable_idempotence'] = True
if self.config['enable_idempotence']:
assert self.config['api_version'] >= (0, 11), "Transactional/Idempotent producer requires >= Kafka 0.11 Brokers"
self._transaction_manager = TransactionManager(
transactional_id=self.config['transactional_id'],
transaction_timeout_ms=self.config['transaction_timeout_ms'],
retry_backoff_ms=self.config['retry_backoff_ms'],
api_version=self.config['api_version'],
metadata=self._metadata,
)
if self._transaction_manager.is_transactional():
log.info("%s: Instantiated a transactional producer.", str(self))
else:
log.info("%s: Instantiated an idempotent producer.", str(self))
if self.config['retries'] == 0:
raise Errors.KafkaConfigurationError("Must set 'retries' to non-zero when using the idempotent producer.")
if 'max_in_flight_requests_per_connection' not in user_provided_configs:
log.info("%s: Overriding the default 'max_in_flight_requests_per_connection' to 1 since idempontence is enabled.", str(self))
self.config['max_in_flight_requests_per_connection'] = 1
elif self.config['max_in_flight_requests_per_connection'] != 1:
raise Errors.KafkaConfigurationError("Must set 'max_in_flight_requests_per_connection' to 1 in order"
" to use the idempotent producer."
" Otherwise we cannot guarantee idempotence.")
if 'acks' not in user_provided_configs:
log.info("%s: Overriding the default 'acks' config to 'all' since idempotence is enabled", str(self))
self.config['acks'] = -1
elif self.config['acks'] != -1:
raise Errors.KafkaConfigurationError("Must set 'acks' config to 'all' in order to use the idempotent"
" producer. Otherwise we cannot guarantee idempotence")
message_version = self.max_usable_produce_magic(self.config['api_version'])
self._accumulator = RecordAccumulator(
transaction_manager=self._transaction_manager,
message_version=message_version,
**self.config)
guarantee_message_order = bool(self.config['max_in_flight_requests_per_connection'] == 1)
self._sender = Sender(client, self._metadata,
self._accumulator, self._metrics,
self._accumulator,
metrics=self._metrics,
transaction_manager=self._transaction_manager,
guarantee_message_order=guarantee_message_order,
**self.config)
self._sender.daemon = True
@@ -415,7 +561,7 @@ class KafkaProducer(object):
self._cleanup = self._cleanup_factory()
atexit.register(self._cleanup)
log.debug("Kafka producer started")
log.debug("%s: Kafka producer started", str(self))
def bootstrap_connected(self):
"""Return True if the bootstrap is connected."""
@@ -426,7 +572,7 @@ class KafkaProducer(object):
_self = weakref.proxy(self)
def wrapper():
try:
_self.close(timeout=0)
_self.close(timeout=0, null_logger=True)
except (ReferenceError, AttributeError):
pass
return wrapper
@@ -449,28 +595,28 @@ class KafkaProducer(object):
self._cleanup = None
def __del__(self):
# Disable logger during destruction to avoid touching dangling references
class NullLogger(object):
def __getattr__(self, name):
return lambda *args: None
self.close(timeout=1, null_logger=True)
global log
log = NullLogger()
self.close()
def close(self, timeout=None):
def close(self, timeout=None, null_logger=False):
"""Close this producer.
Arguments:
timeout (float, optional): timeout in seconds to wait for completion.
"""
if null_logger:
# Disable logger during destruction to avoid touching dangling references
class NullLogger(object):
def __getattr__(self, name):
return lambda *args: None
global log
log = NullLogger()
# drop our atexit handler now to avoid leaks
self._unregister_cleanup()
if not hasattr(self, '_closed') or self._closed:
log.info('Kafka producer closed')
log.info('%s: Kafka producer closed', str(self))
return
if timeout is None:
# threading.TIMEOUT_MAX is available in Python3.3+
@@ -480,15 +626,16 @@ class KafkaProducer(object):
else:
assert timeout >= 0
log.info("Closing the Kafka producer with %s secs timeout.", timeout)
log.info("%s: Closing the Kafka producer with %s secs timeout.", str(self), timeout)
self.flush(timeout)
invoked_from_callback = bool(threading.current_thread() is self._sender)
if timeout > 0:
if invoked_from_callback:
log.warning("Overriding close timeout %s secs to 0 in order to"
log.warning("%s: Overriding close timeout %s secs to 0 in order to"
" prevent useless blocking due to self-join. This"
" means you have incorrectly invoked close with a"
" non-zero timeout from the producer call-back.",
timeout)
str(self), timeout)
else:
# Try to close gracefully.
if self._sender is not None:
@@ -496,12 +643,13 @@ class KafkaProducer(object):
self._sender.join(timeout)
if self._sender is not None and self._sender.is_alive():
log.info("Proceeding to force close the producer since pending"
log.info("%s: Proceeding to force close the producer since pending"
" requests could not be completed within timeout %s.",
timeout)
str(self), timeout)
self._sender.force_close()
self._metrics.close()
if self._metrics:
self._metrics.close()
try:
self.config['key_serializer'].close()
except AttributeError:
@@ -511,23 +659,23 @@ class KafkaProducer(object):
except AttributeError:
pass
self._closed = True
log.debug("The Kafka producer has closed.")
log.debug("%s: The Kafka producer has closed.", str(self))
def partitions_for(self, topic):
"""Returns set of all known partitions for the topic."""
max_wait = self.config['max_block_ms'] / 1000.0
return self._wait_on_metadata(topic, max_wait)
return self._wait_on_metadata(topic, self.config['max_block_ms'])
def _max_usable_produce_magic(self):
if self.config['api_version'] >= (0, 11):
@classmethod
def max_usable_produce_magic(cls, api_version):
if api_version >= (0, 11):
return 2
elif self.config['api_version'] >= (0, 10):
elif api_version >= (0, 10, 0):
return 1
else:
return 0
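
A quick illustration of the version-to-magic mapping implemented by the classmethod above:

    assert KafkaProducer.max_usable_produce_magic((3, 9)) == 2     # message format v2
    assert KafkaProducer.max_usable_produce_magic((0, 10, 1)) == 1 # v1, with timestamps
    assert KafkaProducer.max_usable_produce_magic((0, 9)) == 0     # legacy v0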
def _estimate_size_in_bytes(self, key, value, headers=[]):
magic = self._max_usable_produce_magic()
magic = self.max_usable_produce_magic(self.config['api_version'])
if magic == 2:
return DefaultRecordBatchBuilder.estimate_size_in_bytes(
key, value, headers)
@@ -535,6 +683,114 @@ class KafkaProducer(object):
return LegacyRecordBatchBuilder.estimate_size_in_bytes(
magic, self.config['compression_type'], key, value)
def init_transactions(self):
"""
Needs to be called before any other methods when the transactional.id is set in the configuration.
This method does the following:
1. Ensures any transactions initiated by previous instances of the producer with the same
transactional_id are completed. If the previous instance had failed with a transaction in
progress, it will be aborted. If the last transaction had begun completion,
but not yet finished, this method awaits its completion.
2. Gets the internal producer id and epoch, used in all future transactional
messages issued by the producer.
Note that this method will raise KafkaTimeoutError if the transactional state cannot
be initialized before expiration of `max_block_ms`.
Retrying after a KafkaTimeoutError will continue to wait for the prior request to succeed or fail.
Retrying after any other exception will start a new initialization attempt.
Retrying after a successful initialization will do nothing.
Raises:
IllegalStateError: if no transactional_id has been configured
AuthorizationError: fatal error indicating that the configured
transactional_id is not authorized.
KafkaError: if the producer has encountered a previous fatal error or for any other unexpected error
KafkaTimeoutError: if the time taken to initialize the transaction has surpassed `max.block.ms`.
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot call init_transactions without setting a transactional_id.")
if self._init_transactions_result is None:
self._init_transactions_result = self._transaction_manager.initialize_transactions()
self._sender.wakeup()
try:
if not self._init_transactions_result.wait(timeout_ms=self.config['max_block_ms']):
raise Errors.KafkaTimeoutError("Timeout expired while initializing transactional state in %s ms." % (self.config['max_block_ms'],))
finally:
if self._init_transactions_result.failed:
self._init_transactions_result = None
def begin_transaction(self):
""" Should be called before the start of each new transaction.
Note that prior to the first invocation of this method,
you must invoke `init_transactions()` exactly one time.
Raises:
ProducerFencedError: if another producer with the same
transactional_id is active.
"""
# Set the transactional bit in the producer.
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions")
self._transaction_manager.begin_transaction()
def send_offsets_to_transaction(self, offsets, consumer_group_id):
"""
Sends a list of consumed offsets to the consumer group coordinator, and also marks
those offsets as part of the current transaction. These offsets will be considered
consumed only if the transaction is committed successfully.
This method should be used when you need to batch consumed and produced messages
together, typically in a consume-transform-produce pattern.
Arguments:
offsets ({TopicPartition: OffsetAndMetadata}): map of topic-partition -> offsets to commit
as part of current transaction.
consumer_group_id (str): Name of consumer group for offsets commit.
Raises:
IllegalStateError: if no transactional_id, or transaction has not been started.
ProducerFencedError: fatal error indicating another producer with the same transactional_id is active.
UnsupportedVersionError: fatal error indicating the broker does not support transactions (i.e. if < 0.11).
UnsupportedForMessageFormatError: fatal error indicating the message format used for the offsets
topic on the broker does not support transactions.
AuthorizationError: fatal error indicating that the configured transactional_id is not authorized.
KafkaError: if the producer has encountered a previous fatal or abortable error, or for any
other unexpected error
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions")
result = self._transaction_manager.send_offsets_to_transaction(offsets, consumer_group_id)
self._sender.wakeup()
result.wait()
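
A hedged sketch of the consume-transform-produce pattern this method supports; transform() is hypothetical, and the exact OffsetAndMetadata fields vary across kafka-python versions (offset and metadata are assumed here):

    from kafka import KafkaConsumer, KafkaProducer, TopicPartition
    from kafka.structs import OffsetAndMetadata

    consumer = KafkaConsumer('input-topic', group_id='my-group',
                             enable_auto_commit=False)
    producer = KafkaProducer(transactional_id='my-app-shard-0')
    producer.init_transactions()

    for msg in consumer:
        producer.begin_transaction()
        producer.send('output-topic', transform(msg.value))  # transform() is hypothetical
        # Commit the consumed offset within the transaction rather than via the consumer.
        offsets = {TopicPartition(msg.topic, msg.partition):
                   OffsetAndMetadata(msg.offset + 1, None)}  # field layout assumed
        producer.send_offsets_to_transaction(offsets, 'my-group')
        producer.commit_transaction()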
def commit_transaction(self):
""" Commits the ongoing transaction.
Raises: ProducerFencedError if another producer with the same
transactional_id is active.
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot commit transaction since transactions are not enabled")
result = self._transaction_manager.begin_commit()
self._sender.wakeup()
result.wait()
def abort_transaction(self):
""" Aborts the ongoing transaction.
Raises: ProducerFencedError if another producer with the same
transactional_id is active.
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot abort transaction since transactions are not enabled.")
result = self._transaction_manager.begin_abort()
self._sender.wakeup()
result.wait()
def send(self, topic, value=None, key=None, headers=None, partition=None, timestamp_ms=None):
"""Publish a message to a topic.
@@ -567,44 +823,58 @@ class KafkaProducer(object):
Raises:
KafkaTimeoutError: if unable to fetch topic metadata, or unable
to obtain memory buffer prior to configured max_block_ms
TypeError: if topic is not a string
ValueError: if topic is invalid: must be chars (a-zA-Z0-9._-), and less than 250 length
AssertionError: if KafkaProducer is closed, or key and value are both None
"""
assert not self._closed, 'KafkaProducer already closed!'
assert value is not None or self.config['api_version'] >= (0, 8, 1), (
'Null messages require kafka >= 0.8.1')
assert not (value is None and key is None), 'Need at least one: key or value'
ensure_valid_topic_name(topic)
key_bytes = value_bytes = None
timer = Timer(self.config['max_block_ms'], "Failed to assign partition for message in max_block_ms.")
try:
self._wait_on_metadata(topic, self.config['max_block_ms'] / 1000.0)
assigned_partition = None
while assigned_partition is None and not timer.expired:
self._wait_on_metadata(topic, timer.timeout_ms)
key_bytes = self._serialize(
self.config['key_serializer'],
topic, key)
value_bytes = self._serialize(
self.config['value_serializer'],
topic, value)
assert type(key_bytes) in (bytes, bytearray, memoryview, type(None))
assert type(value_bytes) in (bytes, bytearray, memoryview, type(None))
key_bytes = self._serialize(
self.config['key_serializer'],
topic, key)
value_bytes = self._serialize(
self.config['value_serializer'],
topic, value)
assert type(key_bytes) in (bytes, bytearray, memoryview, type(None))
assert type(value_bytes) in (bytes, bytearray, memoryview, type(None))
partition = self._partition(topic, partition, key, value,
key_bytes, value_bytes)
assigned_partition = self._partition(topic, partition, key, value,
key_bytes, value_bytes)
if assigned_partition is None:
raise Errors.KafkaTimeoutError("Failed to assign partition for message after %s secs." % timer.elapsed_ms / 1000)
else:
partition = assigned_partition
if headers is None:
headers = []
assert type(headers) == list
assert all(type(item) == tuple and len(item) == 2 and type(item[0]) == str and type(item[1]) == bytes for item in headers)
assert isinstance(headers, list)
assert all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str) and isinstance(item[1], bytes) for item in headers)
message_size = self._estimate_size_in_bytes(key_bytes, value_bytes, headers)
self._ensure_valid_record_size(message_size)
tp = TopicPartition(topic, partition)
log.debug("Sending (key=%r value=%r headers=%r) to %s", key, value, headers, tp)
log.debug("%s: Sending (key=%r value=%r headers=%r) to %s", str(self), key, value, headers, tp)
if self._transaction_manager and self._transaction_manager.is_transactional():
self._transaction_manager.maybe_add_partition_to_transaction(tp)
result = self._accumulator.append(tp, timestamp_ms,
key_bytes, value_bytes, headers,
self.config['max_block_ms'],
estimated_size=message_size)
key_bytes, value_bytes, headers)
future, batch_is_full, new_batch_created = result
if batch_is_full or new_batch_created:
log.debug("Waking up the sender since %s is either full or"
" getting a new batch", tp)
log.debug("%s: Waking up the sender since %s is either full or"
" getting a new batch", str(self), tp)
self._sender.wakeup()
return future
@@ -612,7 +882,7 @@ class KafkaProducer(object):
# for API exceptions return them in the future,
# for other exceptions raise directly
except Errors.BrokerResponseError as e:
log.debug("Exception occurred during message send: %s", e)
log.error("%s: Exception occurred during message send: %s", str(self), e)
return FutureRecordMetadata(
FutureProduceResult(TopicPartition(topic, partition)),
-1, None, None,
@@ -643,7 +913,7 @@ class KafkaProducer(object):
KafkaTimeoutError: failure to flush buffered records within the
provided timeout
"""
log.debug("Flushing accumulated records in producer.") # trace
log.debug("%s: Flushing accumulated records in producer.", str(self))
self._accumulator.begin_flush()
self._sender.wakeup()
self._accumulator.await_flush_completion(timeout=timeout)
@@ -655,13 +925,8 @@ class KafkaProducer(object):
"The message is %d bytes when serialized which is larger than"
" the maximum request size you have configured with the"
" max_request_size configuration" % (size,))
if size > self.config['buffer_memory']:
raise Errors.MessageSizeTooLargeError(
"The message is %d bytes when serialized which is larger than"
" the total memory buffer you have configured with the"
" buffer_memory configuration." % (size,))
def _wait_on_metadata(self, topic, max_wait):
def _wait_on_metadata(self, topic, max_wait_ms):
"""
Wait for cluster metadata including partitions for the given topic to
be available.
@@ -679,32 +944,31 @@ class KafkaProducer(object):
"""
# add topic to metadata topic list if it is not there already.
self._sender.add_topic(topic)
begin = time.time()
elapsed = 0.0
timer = Timer(max_wait_ms, "Failed to update metadata after %.1f secs." % (max_wait_ms / 1000,))
metadata_event = None
while True:
partitions = self._metadata.partitions_for_topic(topic)
if partitions is not None:
return partitions
timer.maybe_raise()
if not metadata_event:
metadata_event = threading.Event()
log.debug("Requesting metadata update for topic %s", topic)
log.debug("%s: Requesting metadata update for topic %s", str(self), topic)
metadata_event.clear()
future = self._metadata.request_update()
future.add_both(lambda e, *args: e.set(), metadata_event)
self._sender.wakeup()
metadata_event.wait(max_wait - elapsed)
elapsed = time.time() - begin
if not metadata_event.is_set():
metadata_event.wait(timer.timeout_ms / 1000)
if not future.is_done:
raise Errors.KafkaTimeoutError(
"Failed to update metadata after %.1f secs." % (max_wait,))
"Failed to update metadata after %.1f secs." % (max_wait_ms / 1000,))
elif future.failed() and not future.retriable():
raise future.exception
elif topic in self._metadata.unauthorized_topics:
raise Errors.TopicAuthorizationFailedError(topic)
raise Errors.TopicAuthorizationFailedError(set([topic]))
else:
log.debug("_wait_on_metadata woke after %s secs.", elapsed)
log.debug("%s: _wait_on_metadata woke after %s secs.", str(self), timer.elapsed_ms / 1000)
def _serialize(self, f, topic, data):
if not f:
@@ -715,16 +979,18 @@ class KafkaProducer(object):
def _partition(self, topic, partition, key, value,
serialized_key, serialized_value):
all_partitions = self._metadata.partitions_for_topic(topic)
available = self._metadata.available_partitions_for_topic(topic)
if all_partitions is None or available is None:
return None
if partition is not None:
assert partition >= 0
assert partition in self._metadata.partitions_for_topic(topic), 'Unrecognized partition'
assert partition in all_partitions, 'Unrecognized partition'
return partition
all_partitions = sorted(self._metadata.partitions_for_topic(topic))
available = list(self._metadata.available_partitions_for_topic(topic))
return self.config['partitioner'](serialized_key,
all_partitions,
available)
sorted(all_partitions),
list(available))
def metrics(self, raw=False):
"""Get metrics on producer performance.
@@ -736,6 +1002,8 @@ class KafkaProducer(object):
This is an unstable interface. It may change in future
releases without warning.
"""
if not self._metrics:
return
if raw:
return self._metrics.metrics.copy()
@@ -747,3 +1015,6 @@ class KafkaProducer(object):
metrics[k.group][k.name] = {}
metrics[k.group][k.name] = v.value()
return metrics
def __str__(self):
return "<KafkaProducer client_id=%s transactional_id=%s>" % (self.config['client_id'], self.config['transactional_id'])

kafka/producer/record_accumulator.py

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from __future__ import absolute_import, division
import collections
import copy
@@ -6,8 +6,14 @@ import logging
import threading
import time
try:
# enum in stdlib as of py3.4
from enum import IntEnum # pylint: disable=import-error
except ImportError:
# vendored backport module
from kafka.vendor.enum34 import IntEnum
import kafka.errors as Errors
from kafka.producer.buffer import SimpleBufferPool
from kafka.producer.future import FutureRecordMetadata, FutureProduceResult
from kafka.record.memory_records import MemoryRecordsBuilder
from kafka.structs import TopicPartition
@@ -35,10 +41,16 @@ class AtomicInteger(object):
return self._val
class FinalState(IntEnum):
ABORTED = 0
FAILED = 1
SUCCEEDED = 2
class ProducerBatch(object):
def __init__(self, tp, records, buffer):
def __init__(self, tp, records, now=None):
now = time.time() if now is None else now
self.max_record_size = 0
now = time.time()
self.created = now
self.drained = None
self.attempts = 0
@@ -48,81 +60,120 @@ class ProducerBatch(object):
self.topic_partition = tp
self.produce_future = FutureProduceResult(tp)
self._retry = False
self._buffer = buffer # We only save it, we don't write to it
self._final_state = None
@property
def final_state(self):
return self._final_state
@property
def record_count(self):
return self.records.next_offset()
def try_append(self, timestamp_ms, key, value, headers):
@property
def producer_id(self):
return self.records.producer_id if self.records else None
@property
def producer_epoch(self):
return self.records.producer_epoch if self.records else None
@property
def has_sequence(self):
return self.records.has_sequence if self.records else False
def try_append(self, timestamp_ms, key, value, headers, now=None):
metadata = self.records.append(timestamp_ms, key, value, headers)
if metadata is None:
return None
now = time.time() if now is None else now
self.max_record_size = max(self.max_record_size, metadata.size)
self.last_append = time.time()
future = FutureRecordMetadata(self.produce_future, metadata.offset,
metadata.timestamp, metadata.crc,
len(key) if key is not None else -1,
len(value) if value is not None else -1,
sum(len(h_key.encode("utf-8")) + len(h_val) for h_key, h_val in headers) if headers else -1)
self.last_append = now
future = FutureRecordMetadata(
self.produce_future,
metadata.offset,
metadata.timestamp,
metadata.crc,
len(key) if key is not None else -1,
len(value) if value is not None else -1,
sum(len(h_key.encode("utf-8")) + len(h_val) for h_key, h_val in headers) if headers else -1)
return future
def done(self, base_offset=None, timestamp_ms=None, exception=None, log_start_offset=None, global_error=None):
level = logging.DEBUG if exception is None else logging.WARNING
log.log(level, "Produced messages to topic-partition %s with base offset"
" %s log start offset %s and error %s.", self.topic_partition, base_offset,
log_start_offset, global_error) # trace
def abort(self, exception):
"""Abort the batch and complete the future and callbacks."""
if self._final_state is not None:
raise Errors.IllegalStateError("Batch has already been completed in final state: %s" % self._final_state)
self._final_state = FinalState.ABORTED
log.debug("Aborting batch for partition %s: %s", self.topic_partition, exception)
self._complete_future(-1, -1, exception)
def done(self, base_offset=None, timestamp_ms=None, exception=None):
"""
Finalize the state of a batch. Final state, once set, is immutable. This function may be called
once or twice on a batch. It may be called twice if
1. An inflight batch expires before a response from the broker is received. The batch's final
state is set to FAILED. But it could succeed on the broker and second time around batch.done() may
try to set SUCCEEDED final state.
2. If the transaction is aborted or the producer is closed forcefully, the final state is
ABORTED but again it could succeed if broker responds with a success.
Attempted transitions from [FAILED | ABORTED] --> SUCCEEDED are logged.
Attempted transitions from one failure state to the same or a different failed state are ignored.
Attempted transitions from SUCCEEDED to the same or a failed state throw an exception.
"""
final_state = FinalState.SUCCEEDED if exception is None else FinalState.FAILED
if self._final_state is None:
self._final_state = final_state
if final_state is FinalState.SUCCEEDED:
log.debug("Successfully produced messages to %s with base offset %s", self.topic_partition, base_offset)
else:
log.warning("Failed to produce messages to topic-partition %s with base offset %s: %s",
self.topic_partition, base_offset, exception)
self._complete_future(base_offset, timestamp_ms, exception)
return True
elif self._final_state is not FinalState.SUCCEEDED:
if final_state is FinalState.SUCCEEDED:
# Log if a previously unsuccessful batch succeeded later on.
log.debug("ProduceResponse returned %s for %s after batch with base offset %s had already been %s.",
final_state, self.topic_partition, base_offset, self._final_state)
else:
# FAILED --> FAILED and ABORTED --> FAILED transitions are ignored.
log.debug("Ignored state transition %s -> %s for %s batch with base offset %s",
self._final_state, final_state, self.topic_partition, base_offset)
else:
# A SUCCESSFUL batch must not attempt another state change.
raise Errors.IllegalStateError("A %s batch must not attempt another state change to %s" % (self._final_state, final_state))
return False
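
A compact restatement of the transition rules above, assuming two fresh ProducerBatch instances named batch and other:

    batch.done(base_offset=42)                 # None -> SUCCEEDED, returns True
    batch.done(exception=Errors.KafkaError())  # SUCCEEDED -> FAILED raises IllegalStateError

    other.done(exception=Errors.KafkaError())  # None -> FAILED, returns True
    other.done(exception=Errors.KafkaError())  # FAILED -> FAILED ignored, returns False
    other.done(base_offset=42)                 # FAILED -> SUCCEEDED logged only, returns False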
def _complete_future(self, base_offset, timestamp_ms, exception):
if self.produce_future.is_done:
log.warning('Batch is already closed -- ignoring batch.done()')
return
raise Errors.IllegalStateError('Batch is already closed!')
elif exception is None:
self.produce_future.success((base_offset, timestamp_ms, log_start_offset))
self.produce_future.success((base_offset, timestamp_ms))
else:
self.produce_future.failure(exception)
def maybe_expire(self, request_timeout_ms, retry_backoff_ms, linger_ms, is_full):
"""Expire batches if metadata is not available
A batch whose metadata is not available should be expired if one
of the following is true:
* the batch is not in retry AND request timeout has elapsed after
it is ready (full or linger.ms has reached).
* the batch is in retry AND request timeout has elapsed after the
backoff period ended.
"""
now = time.time()
since_append = now - self.last_append
since_ready = now - (self.created + linger_ms / 1000.0)
since_backoff = now - (self.last_attempt + retry_backoff_ms / 1000.0)
timeout = request_timeout_ms / 1000.0
error = None
if not self.in_retry() and is_full and timeout < since_append:
error = "%d seconds have passed since last append" % (since_append,)
elif not self.in_retry() and timeout < since_ready:
error = "%d seconds have passed since batch creation plus linger time" % (since_ready,)
elif self.in_retry() and timeout < since_backoff:
error = "%d seconds have passed since last attempt plus backoff time" % (since_backoff,)
if error:
self.records.close()
self.done(-1, None, Errors.KafkaTimeoutError(
"Batch for %s containing %s record(s) expired: %s" % (
self.topic_partition, self.records.next_offset(), error)))
return True
return False
def has_reached_delivery_timeout(self, delivery_timeout_ms, now=None):
now = time.time() if now is None else now
return delivery_timeout_ms / 1000 <= now - self.created
def in_retry(self):
return self._retry
def set_retry(self):
def retry(self, now=None):
now = time.time() if now is None else now
self._retry = True
self.attempts += 1
self.last_attempt = now
self.last_append = now
def buffer(self):
return self._buffer
@property
def is_done(self):
return self.produce_future.is_done
def __str__(self):
return 'ProducerBatch(topic_partition=%s, record_count=%d)' % (
@@ -143,12 +194,6 @@ class RecordAccumulator(object):
A small batch size will make batching less common and may reduce
throughput (a batch size of zero will disable batching entirely).
Default: 16384
buffer_memory (int): The total bytes of memory the producer should use
to buffer records waiting to be sent to the server. If records are
sent faster than they can be delivered to the server the producer
will block up to max_block_ms, raising an exception on timeout.
In the current implementation, this setting is an approximation.
Default: 33554432 (32MB)
compression_attrs (int): The compression type for all data generated by
the producer. Valid values are gzip(1), snappy(2), lz4(3), or
none(0).
@@ -156,7 +201,7 @@ class RecordAccumulator(object):
will also impact the compression ratio (more batching means better
compression). Default: None.
linger_ms (int): An artificial delay time to add before declaring a
messageset (that isn't full) ready for sending. This allows
record batch (that isn't full) ready for sending. This allows
time for more records to arrive. Setting a non-zero linger_ms
will trade off some latency for potentially better throughput
due to more batching (and hence fewer, larger requests).
@@ -166,14 +211,14 @@ class RecordAccumulator(object):
all retries in a short period of time. Default: 100
"""
DEFAULT_CONFIG = {
'buffer_memory': 33554432,
'batch_size': 16384,
'compression_attrs': 0,
'linger_ms': 0,
'request_timeout_ms': 30000,
'delivery_timeout_ms': 120000,
'retry_backoff_ms': 100,
'message_version': 0,
'metrics': None,
'metric_group_prefix': 'producer-metrics',
'transaction_manager': None,
'message_version': 2,
}
def __init__(self, **configs):
@@ -183,22 +228,37 @@ class RecordAccumulator(object):
self.config[key] = configs.pop(key)
self._closed = False
self._transaction_manager = self.config['transaction_manager']
self._flushes_in_progress = AtomicInteger()
self._appends_in_progress = AtomicInteger()
self._batches = collections.defaultdict(collections.deque) # TopicPartition: [ProducerBatch]
self._tp_locks = {None: threading.Lock()} # TopicPartition: Lock, plus a lock to add entries
self._free = SimpleBufferPool(self.config['buffer_memory'],
self.config['batch_size'],
metrics=self.config['metrics'],
metric_group_prefix=self.config['metric_group_prefix'])
self._incomplete = IncompleteProducerBatches()
# The following variables should only be accessed by the sender thread,
# so we don't need to protect them w/ locking.
self.muted = set()
self._drain_index = 0
self._next_batch_expiry_time_ms = float('inf')
def append(self, tp, timestamp_ms, key, value, headers, max_time_to_block_ms,
estimated_size=0):
if self.config['delivery_timeout_ms'] < self.config['linger_ms'] + self.config['request_timeout_ms']:
raise Errors.KafkaConfigurationError("Must set delivery_timeout_ms higher than linger_ms + request_timeout_ms")
@property
def delivery_timeout_ms(self):
return self.config['delivery_timeout_ms']
@property
def next_expiry_time_ms(self):
return self._next_batch_expiry_time_ms
def _tp_lock(self, tp):
if tp not in self._tp_locks:
with self._tp_locks[None]:
if tp not in self._tp_locks:
self._tp_locks[tp] = threading.Lock()
return self._tp_locks[tp]
def append(self, tp, timestamp_ms, key, value, headers, now=None):
"""Add a record to the accumulator, return the append result.
The append result will contain the future metadata, and flag for
@@ -211,59 +271,53 @@ class RecordAccumulator(object):
key (bytes): The key for the record
value (bytes): The value for the record
headers (List[Tuple[str, bytes]]): The header fields for the record
max_time_to_block_ms (int): The maximum time in milliseconds to
block for buffer memory to be available
Returns:
tuple: (future, batch_is_full, new_batch_created)
"""
assert isinstance(tp, TopicPartition), 'not TopicPartition'
assert not self._closed, 'RecordAccumulator is closed'
now = time.time() if now is None else now
# We keep track of the number of appending thread to make sure we do
# not miss batches in abortIncompleteBatches().
self._appends_in_progress.increment()
try:
if tp not in self._tp_locks:
with self._tp_locks[None]:
if tp not in self._tp_locks:
self._tp_locks[tp] = threading.Lock()
with self._tp_locks[tp]:
with self._tp_lock(tp):
# check if we have an in-progress batch
dq = self._batches[tp]
if dq:
last = dq[-1]
future = last.try_append(timestamp_ms, key, value, headers)
future = last.try_append(timestamp_ms, key, value, headers, now=now)
if future is not None:
batch_is_full = len(dq) > 1 or last.records.is_full()
return future, batch_is_full, False
size = max(self.config['batch_size'], estimated_size)
log.debug("Allocating a new %d byte message buffer for %s", size, tp) # trace
buf = self._free.allocate(size, max_time_to_block_ms)
with self._tp_locks[tp]:
with self._tp_lock(tp):
# Need to check if producer is closed again after grabbing the
# dequeue lock.
assert not self._closed, 'RecordAccumulator is closed'
if dq:
last = dq[-1]
future = last.try_append(timestamp_ms, key, value, headers)
future = last.try_append(timestamp_ms, key, value, headers, now=now)
if future is not None:
# Somebody else found us a batch, return the one we
# waited for! Hopefully this doesn't happen often...
self._free.deallocate(buf)
batch_is_full = len(dq) > 1 or last.records.is_full()
return future, batch_is_full, False
if self._transaction_manager and self.config['message_version'] < 2:
raise Errors.UnsupportedVersionError("Attempting to use idempotence with a broker which"
" does not support the required message format (v2)."
" The broker must be version 0.11 or later.")
records = MemoryRecordsBuilder(
self.config['message_version'],
self.config['compression_attrs'],
self.config['batch_size']
)
batch = ProducerBatch(tp, records, buf)
future = batch.try_append(timestamp_ms, key, value, headers)
batch = ProducerBatch(tp, records, now=now)
future = batch.try_append(timestamp_ms, key, value, headers, now=now)
if not future:
raise Exception()
@@ -274,79 +328,43 @@ class RecordAccumulator(object):
finally:
self._appends_in_progress.decrement()
def abort_expired_batches(self, request_timeout_ms, cluster):
"""Abort the batches that have been sitting in RecordAccumulator for
more than the configured request_timeout due to metadata being
unavailable.
def reset_next_batch_expiry_time(self):
self._next_batch_expiry_time_ms = float('inf')
Arguments:
request_timeout_ms (int): milliseconds to timeout
cluster (ClusterMetadata): current metadata for kafka cluster
def maybe_update_next_batch_expiry_time(self, batch):
self._next_batch_expiry_time_ms = min(self._next_batch_expiry_time_ms, batch.created * 1000 + self.delivery_timeout_ms)
Returns:
list of ProducerBatch that were expired
"""
def expired_batches(self, now=None):
"""Get a list of batches which have been sitting in the accumulator too long and need to be expired."""
expired_batches = []
to_remove = []
count = 0
for tp in list(self._batches.keys()):
assert tp in self._tp_locks, 'TopicPartition not in locks dict'
# We only check if the batch should be expired if the partition
# does not have a batch in flight. This is to avoid the later
# batches get expired when an earlier batch is still in progress.
# This protection only takes effect when user sets
# max.in.flight.request.per.connection=1. Otherwise the expiration
order is not guaranteed.
if tp in self.muted:
continue
with self._tp_locks[tp]:
with self._tp_lock(tp):
# iterate over the batches and expire them if they have stayed
# in accumulator for more than request_timeout_ms
dq = self._batches[tp]
for batch in dq:
is_full = bool(bool(batch != dq[-1]) or batch.records.is_full())
# check if the batch is expired
if batch.maybe_expire(request_timeout_ms,
self.config['retry_backoff_ms'],
self.config['linger_ms'],
is_full):
while dq:
batch = dq[0]
if batch.has_reached_delivery_timeout(self.delivery_timeout_ms, now=now):
dq.popleft()
batch.records.close()
expired_batches.append(batch)
to_remove.append(batch)
count += 1
self.deallocate(batch)
else:
# Stop at the first batch that has not expired.
self.maybe_update_next_batch_expiry_time(batch)
break
# Python does not allow us to mutate the dq during iteration
# Assuming expired batches are infrequent, this is better than
# creating a new copy of the deque for iteration on every loop
if to_remove:
for batch in to_remove:
dq.remove(batch)
to_remove = []
if expired_batches:
log.warning("Expired %d batches in accumulator", count) # trace
return expired_batches
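# --- Editor's sketch (illustrative, not part of this commit) ---
# The expiry test above reduces to "now is past created + delivery timeout".
# A minimal standalone version, assuming `created` is a unix timestamp in
# seconds and `delivery_timeout_ms` is in milliseconds (names hypothetical):
import time

def has_reached_delivery_timeout(created, delivery_timeout_ms, now=None):
    now = time.time() if now is None else now
    return now - created >= delivery_timeout_ms / 1000

# a batch created 125s ago with a 120s delivery timeout is expired:
assert has_reached_delivery_timeout(time.time() - 125, 120000)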
def reenqueue(self, batch):
"""Re-enqueue the given record batch in the accumulator to retry."""
now = time.time()
batch.attempts += 1
batch.last_attempt = now
batch.last_append = now
batch.set_retry()
assert batch.topic_partition in self._tp_locks, 'TopicPartition not in locks dict'
assert batch.topic_partition in self._batches, 'TopicPartition not in batches'
dq = self._batches[batch.topic_partition]
with self._tp_locks[batch.topic_partition]:
def reenqueue(self, batch, now=None):
"""
Re-enqueue the given record batch in the accumulator. Sender._complete_batch already checks
whether the batch has reached delivery_timeout_ms, so we do not repeat the delivery timeout check here.
"""
batch.retry(now=now)
with self._tp_lock(batch.topic_partition):
dq = self._batches[batch.topic_partition]
dq.appendleft(batch)
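# --- Editor's sketch (illustrative, not part of this commit) ---
# appendleft puts the retried batch back at the head of the per-partition
# deque, so it is re-sent before any newer batches and per-partition
# ordering is preserved:
from collections import deque

dq = deque(['batch-2', 'batch-3'])
dq.appendleft('batch-1-retry')
assert list(dq) == ['batch-1-retry', 'batch-2', 'batch-3']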
def ready(self, cluster):
def ready(self, cluster, now=None):
"""
Get a list of nodes whose partitions are ready to be sent, and the
earliest time at which any non-sendable partition will be ready;
@@ -380,9 +398,8 @@ class RecordAccumulator(object):
ready_nodes = set()
next_ready_check = 9999999.99
unknown_leaders_exist = False
now = time.time()
now = time.time() if now is None else now
exhausted = bool(self._free.queued() > 0)
# several threads are accessing self._batches -- to simplify
# concurrent access, we iterate over a snapshot of partitions
# and lock each partition separately as needed
@@ -397,23 +414,23 @@ class RecordAccumulator(object):
elif tp in self.muted:
continue
with self._tp_locks[tp]:
with self._tp_lock(tp):
dq = self._batches[tp]
if not dq:
continue
batch = dq[0]
retry_backoff = self.config['retry_backoff_ms'] / 1000.0
linger = self.config['linger_ms'] / 1000.0
backing_off = bool(batch.attempts > 0 and
batch.last_attempt + retry_backoff > now)
retry_backoff = self.config['retry_backoff_ms'] / 1000
linger = self.config['linger_ms'] / 1000
backing_off = bool(batch.attempts > 0
and (batch.last_attempt + retry_backoff) > now)
waited_time = now - batch.last_attempt
time_to_wait = retry_backoff if backing_off else linger
time_left = max(time_to_wait - waited_time, 0)
full = bool(len(dq) > 1 or batch.records.is_full())
expired = bool(waited_time >= time_to_wait)
sendable = (full or expired or exhausted or self._closed or
self._flush_in_progress())
sendable = (full or expired or self._closed or
self.flush_in_progress())
if sendable and not backing_off:
ready_nodes.add(leader)
@@ -427,16 +444,98 @@ class RecordAccumulator(object):
return ready_nodes, next_ready_check, unknown_leaders_exist
def has_unsent(self):
"""Return whether there is any unsent record in the accumulator."""
def has_undrained(self):
"""Check whether there are any batches which haven't been drained"""
for tp in list(self._batches.keys()):
with self._tp_locks[tp]:
with self._tp_lock(tp):
dq = self._batches[tp]
if len(dq):
return True
return False
def drain(self, cluster, nodes, max_size):
def _should_stop_drain_batches_for_partition(self, first, tp):
if self._transaction_manager:
if not self._transaction_manager.is_send_to_partition_allowed(tp):
return True
if not self._transaction_manager.producer_id_and_epoch.is_valid:
# we cannot send the batch until we have refreshed the PID
log.debug("Waiting to send ready batches because transaction producer id is not valid")
return True
return False
def drain_batches_for_one_node(self, cluster, node_id, max_size, now=None):
now = time.time() if now is None else now
size = 0
ready = []
partitions = list(cluster.partitions_for_broker(node_id))
if not partitions:
return ready
# to make starvation less likely this loop doesn't start at 0
self._drain_index %= len(partitions)
start = None
while start != self._drain_index:
tp = partitions[self._drain_index]
if start is None:
start = self._drain_index
self._drain_index += 1
self._drain_index %= len(partitions)
# Only proceed if the partition has no in-flight batches.
if tp in self.muted:
continue
if tp not in self._batches:
continue
with self._tp_lock(tp):
dq = self._batches[tp]
if len(dq) == 0:
continue
first = dq[0]
backoff = bool(first.attempts > 0 and
first.last_attempt + self.config['retry_backoff_ms'] / 1000 > now)
# Only drain the batch if it is not during backoff
if backoff:
continue
if (size + first.records.size_in_bytes() > max_size
and len(ready) > 0):
# there is a rare case that a single batch
# size is larger than the request size due
# to compression; in this case we will
# still eventually send this batch in a
# single request
break
else:
if self._should_stop_drain_batches_for_partition(first, tp):
break
batch = dq.popleft()
if self._transaction_manager and not batch.in_retry():
# If the batch is in retry, then we should not change the pid and
# sequence number, since this may introduce duplicates. In particular,
# the previous attempt may actually have been accepted, and if we change
# the pid and sequence here, this attempt will also be accepted, causing
# a duplicate.
sequence_number = self._transaction_manager.sequence_number(batch.topic_partition)
log.debug("Dest: %s: %s producer_id=%s epoch=%s sequence=%s",
node_id, batch.topic_partition,
self._transaction_manager.producer_id_and_epoch.producer_id,
self._transaction_manager.producer_id_and_epoch.epoch,
sequence_number)
batch.records.set_producer_state(
self._transaction_manager.producer_id_and_epoch.producer_id,
self._transaction_manager.producer_id_and_epoch.epoch,
sequence_number,
self._transaction_manager.is_transactional()
)
batch.records.close()
size += batch.records.size_in_bytes()
ready.append(batch)
batch.drained = now
return ready
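# --- Editor's sketch (illustrative, not part of this commit) ---
# The loop above resumes from a persistent index instead of 0 so that the
# first partitions in the list cannot starve the later ones across calls.
# A toy version of the same rotation:
def round_robin(items, start_index):
    """Yield each item exactly once, starting at start_index and wrapping."""
    n = len(items)
    for offset in range(n):
        yield items[(start_index + offset) % n]

assert list(round_robin(['p0', 'p1', 'p2'], 1)) == ['p1', 'p2', 'p0']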
def drain(self, cluster, nodes, max_size, now=None):
"""
Drain all the data for the given nodes and collate them into a list of
batches that will fit within the specified size on a per-node basis.
@@ -454,59 +553,17 @@ class RecordAccumulator(object):
if not nodes:
return {}
now = time.time()
now = time.time() if now is None else now
batches = {}
for node_id in nodes:
size = 0
partitions = list(cluster.partitions_for_broker(node_id))
ready = []
# to make starvation less likely this loop doesn't start at 0
self._drain_index %= len(partitions)
start = self._drain_index
while True:
tp = partitions[self._drain_index]
if tp in self._batches and tp not in self.muted:
with self._tp_locks[tp]:
dq = self._batches[tp]
if dq:
first = dq[0]
backoff = (
bool(first.attempts > 0) and
bool(first.last_attempt +
self.config['retry_backoff_ms'] / 1000.0
> now)
)
# Only drain the batch if it is not during backoff
if not backoff:
if (size + first.records.size_in_bytes() > max_size
and len(ready) > 0):
# there is a rare case that a single batch
# size is larger than the request size due
# to compression; in this case we will
# still eventually send this batch in a
# single request
break
else:
batch = dq.popleft()
batch.records.close()
size += batch.records.size_in_bytes()
ready.append(batch)
batch.drained = now
self._drain_index += 1
self._drain_index %= len(partitions)
if start == self._drain_index:
break
batches[node_id] = ready
batches[node_id] = self.drain_batches_for_one_node(cluster, node_id, max_size, now=now)
return batches
def deallocate(self, batch):
"""Deallocate the record batch."""
self._incomplete.remove(batch)
self._free.deallocate(batch.buffer())
def _flush_in_progress(self):
def flush_in_progress(self):
"""Are there any threads currently waiting on a flush?"""
return self._flushes_in_progress.get() > 0
@@ -535,6 +592,10 @@ class RecordAccumulator(object):
finally:
self._flushes_in_progress.decrement()
@property
def has_incomplete(self):
return bool(self._incomplete)
def abort_incomplete_batches(self):
"""
This function is only called when sender is closed forcefully. It will fail all the
@@ -544,27 +605,41 @@ class RecordAccumulator(object):
# 1. Avoid losing batches.
# 2. Free up memory in case appending threads are blocked on buffer full.
# This is a tight loop but should be able to get through very quickly.
error = Errors.IllegalStateError("Producer is closed forcefully.")
while True:
self._abort_batches()
self._abort_batches(error)
if not self._appends_in_progress.get():
break
# After this point, no thread will append any messages because they will see the close
# flag set. We need to do one last abort after no thread is appending, in case there was a new
# batch appended by the last appending thread.
self._abort_batches()
self._abort_batches(error)
self._batches.clear()
def _abort_batches(self):
def _abort_batches(self, error):
"""Go through incomplete batches and abort them."""
error = Errors.IllegalStateError("Producer is closed forcefully.")
for batch in self._incomplete.all():
tp = batch.topic_partition
# Close the batch before aborting
with self._tp_locks[tp]:
with self._tp_lock(tp):
batch.records.close()
batch.done(exception=error)
self._batches[tp].remove(batch)
batch.abort(error)
self.deallocate(batch)
def abort_undrained_batches(self, error):
for batch in self._incomplete.all():
tp = batch.topic_partition
with self._tp_lock(tp):
aborted = False
if not batch.is_done:
aborted = True
batch.records.close()
self._batches[tp].remove(batch)
if aborted:
batch.abort(error)
self.deallocate(batch)
def close(self):
"""Close this accumulator and force all the record buffers to be drained."""
self._closed = True
@@ -579,12 +654,21 @@ class IncompleteProducerBatches(object):
def add(self, batch):
with self._lock:
return self._incomplete.add(batch)
self._incomplete.add(batch)
def remove(self, batch):
with self._lock:
return self._incomplete.remove(batch)
try:
self._incomplete.remove(batch)
except KeyError:
pass
def all(self):
with self._lock:
return list(self._incomplete)
def __bool__(self):
return bool(self._incomplete)
__nonzero__ = __bool__
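# --- Editor's sketch (illustrative, not part of this commit) ---
# Aliasing __nonzero__ to __bool__ is the usual way to make truthiness
# work on both Python 2 (which calls __nonzero__) and Python 3 (__bool__):
class Box(object):
    def __init__(self, items):
        self._items = items
    def __bool__(self):
        return bool(self._items)
    __nonzero__ = __bool__  # Python 2 compatibility

assert Box([1]) and not Box([])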
@@ -2,6 +2,7 @@ from __future__ import absolute_import, division
import collections
import copy
import heapq
import logging
import threading
import time
@@ -11,6 +12,8 @@ from kafka.vendor import six
from kafka import errors as Errors
from kafka.metrics.measurable import AnonMeasurable
from kafka.metrics.stats import Avg, Max, Rate
from kafka.producer.transaction_manager import ProducerIdAndEpoch
from kafka.protocol.init_producer_id import InitProducerIdRequest
from kafka.protocol.produce import ProduceRequest
from kafka.structs import TopicPartition
from kafka.version import __version__
@@ -27,14 +30,18 @@ class Sender(threading.Thread):
DEFAULT_CONFIG = {
'max_request_size': 1048576,
'acks': 1,
'retries': 0,
'retries': float('inf'),
'request_timeout_ms': 30000,
'retry_backoff_ms': 100,
'metrics': None,
'guarantee_message_order': False,
'transaction_manager': None,
'transactional_id': None,
'transaction_timeout_ms': 60000,
'client_id': 'kafka-python-' + __version__,
'api_version': (0, 8, 0),
}
def __init__(self, client, metadata, accumulator, metrics, **configs):
def __init__(self, client, metadata, accumulator, **configs):
super(Sender, self).__init__()
self.config = copy.copy(self.DEFAULT_CONFIG)
for key in self.config:
@@ -48,32 +55,75 @@ class Sender(threading.Thread):
self._running = True
self._force_close = False
self._topics_to_add = set()
self._sensors = SenderMetrics(metrics, self._client, self._metadata)
if self.config['metrics']:
self._sensors = SenderMetrics(self.config['metrics'], self._client, self._metadata)
else:
self._sensors = None
self._transaction_manager = self.config['transaction_manager']
# A per-partition queue of batches ordered by creation time for tracking the in-flight batches
self._in_flight_batches = collections.defaultdict(list)
def _maybe_remove_from_inflight_batches(self, batch):
try:
queue = self._in_flight_batches[batch.topic_partition]
except KeyError:
return
try:
idx = queue.index((batch.created, batch))
except ValueError:
return
# https://stackoverflow.com/questions/10162679/python-delete-element-from-heap
queue[idx] = queue[-1]
queue.pop()
heapq.heapify(queue)
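# --- Editor's sketch (illustrative, not part of this commit) ---
# The swap-with-last trick above deletes an arbitrary entry from a
# heapq-backed list without popping everything off: overwrite the victim
# with the tail element, shrink the list, then heapify to restore the
# invariant. A minimal standalone version:
import heapq

def heap_delete(heap, item):
    idx = heap.index(item)  # O(n) scan for the entry
    heap[idx] = heap[-1]    # overwrite it with the last element
    heap.pop()              # shrink by one
    heapq.heapify(heap)     # O(n) repair of the heap invariant

h = [3, 1, 4, 1, 5]
heapq.heapify(h)
heap_delete(h, 4)
assert h[0] == 1 and 4 not in h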
def _get_expired_inflight_batches(self, now=None):
"""Get the in-flight batches that has reached delivery timeout."""
expired_batches = []
to_remove = []
for tp, queue in six.iteritems(self._in_flight_batches):
while queue:
_created_at, batch = queue[0]
if batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms):
heapq.heappop(queue)
if batch.final_state is None:
expired_batches.append(batch)
else:
raise Errors.IllegalStateError("%s batch created at %s gets unexpected final state %s" % (batch.topic_partition, batch.created, batch.final_state))
else:
self._accumulator.maybe_update_next_batch_expiry_time(batch)
break
else:
# Avoid mutating in_flight_batches during iteration
to_remove.append(tp)
for tp in to_remove:
del self._in_flight_batches[tp]
return expired_batches
def run(self):
"""The main run loop for the sender thread."""
log.debug("Starting Kafka producer I/O thread.")
log.debug("%s: Starting Kafka producer I/O thread.", str(self))
# main loop, runs until close is called
while self._running:
try:
self.run_once()
except Exception:
log.exception("Uncaught error in kafka producer I/O thread")
log.exception("%s: Uncaught error in kafka producer I/O thread", str(self))
log.debug("Beginning shutdown of Kafka producer I/O thread, sending"
" remaining records.")
log.debug("%s: Beginning shutdown of Kafka producer I/O thread, sending"
" remaining records.", str(self))
# okay we stopped accepting requests but there may still be
# requests in the accumulator or waiting for acknowledgment,
# wait until these are completed.
while (not self._force_close
and (self._accumulator.has_unsent()
and (self._accumulator.has_undrained()
or self._client.in_flight_request_count() > 0)):
try:
self.run_once()
except Exception:
log.exception("Uncaught error in kafka producer I/O thread")
log.exception("%s: Uncaught error in kafka producer I/O thread", str(self))
if self._force_close:
# We need to fail all the incomplete batches and wake up the
@@ -83,38 +133,75 @@ class Sender(threading.Thread):
try:
self._client.close()
except Exception:
log.exception("Failed to close network client")
log.exception("%s: Failed to close network client", str(self))
log.debug("Shutdown of Kafka producer I/O thread has completed.")
log.debug("%s: Shutdown of Kafka producer I/O thread has completed.", str(self))
def run_once(self):
"""Run a single iteration of sending."""
while self._topics_to_add:
self._client.add_topic(self._topics_to_add.pop())
if self._transaction_manager:
try:
if not self._transaction_manager.is_transactional():
# this is an idempotent producer, so make sure we have a producer id
self._maybe_wait_for_producer_id()
elif self._transaction_manager.has_in_flight_transactional_request() or self._maybe_send_transactional_request():
# as long as there are outstanding transactional requests, we simply wait for them to return
self._client.poll(timeout_ms=self.config['retry_backoff_ms'])
return
# do not continue sending if the transaction manager is in a failed state or if there
# is no producer id (for the idempotent case).
if self._transaction_manager.has_fatal_error() or not self._transaction_manager.has_producer_id():
last_error = self._transaction_manager.last_error
if last_error is not None:
self._maybe_abort_batches(last_error)
self._client.poll(timeout_ms=self.config['retry_backoff_ms'])
return
elif self._transaction_manager.has_abortable_error():
self._accumulator.abort_undrained_batches(self._transaction_manager.last_error)
except Errors.SaslAuthenticationFailedError as e:
# This is already logged as error, but propagated here to perform any clean ups.
log.debug("%s: Authentication exception while processing transactional request: %s", str(self), e)
self._transaction_manager.authentication_failed(e)
poll_timeout_ms = self._send_producer_data()
self._client.poll(timeout_ms=poll_timeout_ms)
def _send_producer_data(self, now=None):
now = time.time() if now is None else now
# get the list of partitions with data ready to send
result = self._accumulator.ready(self._metadata)
result = self._accumulator.ready(self._metadata, now=now)
ready_nodes, next_ready_check_delay, unknown_leaders_exist = result
# if there are any partitions whose leaders are not known yet, force
# metadata update
if unknown_leaders_exist:
log.debug('Unknown leaders exist, requesting metadata update')
log.debug('%s: Unknown leaders exist, requesting metadata update', str(self))
self._metadata.request_update()
# remove any nodes we aren't ready to send to
not_ready_timeout = float('inf')
not_ready_timeout_ms = float('inf')
for node in list(ready_nodes):
if not self._client.is_ready(node):
log.debug('Node %s not ready; delaying produce of accumulated batch', node)
node_delay_ms = self._client.connection_delay(node)
log.debug('%s: Node %s not ready; delaying produce of accumulated batch (%f ms)', str(self), node, node_delay_ms)
self._client.maybe_connect(node, wakeup=False)
ready_nodes.remove(node)
not_ready_timeout = min(not_ready_timeout,
self._client.connection_delay(node))
not_ready_timeout_ms = min(not_ready_timeout_ms, node_delay_ms)
# create produce requests
batches_by_node = self._accumulator.drain(
self._metadata, ready_nodes, self.config['max_request_size'])
self._metadata, ready_nodes, self.config['max_request_size'], now=now)
for batch_list in six.itervalues(batches_by_node):
for batch in batch_list:
item = (batch.created, batch)
queue = self._in_flight_batches[batch.topic_partition]
heapq.heappush(queue, item)
if self.config['guarantee_message_order']:
# Mute all the partitions drained
@@ -122,42 +209,130 @@ class Sender(threading.Thread):
for batch in batch_list:
self._accumulator.muted.add(batch.topic_partition)
expired_batches = self._accumulator.abort_expired_batches(
self.config['request_timeout_ms'], self._metadata)
for expired_batch in expired_batches:
self._sensors.record_errors(expired_batch.topic_partition.topic, expired_batch.record_count)
self._accumulator.reset_next_batch_expiry_time()
expired_batches = self._accumulator.expired_batches(now=now)
expired_batches.extend(self._get_expired_inflight_batches(now=now))
if expired_batches:
log.debug("%s: Expired %s batches in accumulator", str(self), len(expired_batches))
# Reset the producer_id if an expired batch has previously been sent to the broker.
# See the documentation of `TransactionState.reset_producer_id` to understand why
# we need to reset the producer id here.
if self._transaction_manager and any([batch.in_retry() for batch in expired_batches]):
needs_transaction_state_reset = True
else:
needs_transaction_state_reset = False
for expired_batch in expired_batches:
error = Errors.KafkaTimeoutError(
"Expiring %d record(s) for %s: %s ms has passed since batch creation" % (
expired_batch.record_count, expired_batch.topic_partition,
int((time.time() - expired_batch.created) * 1000)))
self._fail_batch(expired_batch, error, base_offset=-1)
if self._sensors:
self._sensors.update_produce_request_metrics(batches_by_node)
if needs_transaction_state_reset:
self._transaction_manager.reset_producer_id()
return 0
self._sensors.update_produce_request_metrics(batches_by_node)
requests = self._create_produce_requests(batches_by_node)
# If we have any nodes that are ready to send + have sendable data,
# poll with 0 timeout so this can immediately loop and try sending more
# data. Otherwise, the timeout is determined by nodes that have
# partitions with data that isn't yet sendable (e.g. lingering, backing
# off). Note that this specifically does not include nodes with
# data. Otherwise, the timeout will be the smaller value between next
# batch expiry time, and the delay time for checking data availability.
# Note that the nodes may have data that isn't yet sendable due to
# lingering, backing off, etc. This specifically does not include nodes with
# sendable data that aren't ready to send since they would cause busy
# looping.
poll_timeout_ms = min(next_ready_check_delay * 1000, not_ready_timeout)
poll_timeout_ms = min(next_ready_check_delay * 1000,
not_ready_timeout_ms,
self._accumulator.next_expiry_time_ms - now * 1000)
if poll_timeout_ms < 0:
poll_timeout_ms = 0
if ready_nodes:
log.debug("Nodes with data ready to send: %s", ready_nodes) # trace
log.debug("Created %d produce requests: %s", len(requests), requests) # trace
log.debug("%s: Nodes with data ready to send: %s", str(self), ready_nodes) # trace
log.debug("%s: Created %d produce requests: %s", str(self), len(requests), requests) # trace
# if some partitions are already ready to be sent, the select time
# would be 0; otherwise if some partition already has some data
# accumulated but not ready yet, the select time will be the time
# difference between now and its linger expiry time; otherwise the
# select time will be the time difference between now and the
# metadata expiry time
poll_timeout_ms = 0
for node_id, request in six.iteritems(requests):
batches = batches_by_node[node_id]
log.debug('Sending Produce Request: %r', request)
log.debug('%s: Sending Produce Request: %r', str(self), request)
(self._client.send(node_id, request, wakeup=False)
.add_callback(
self._handle_produce_response, node_id, time.time(), batches)
.add_errback(
self._failed_produce, batches, node_id))
return poll_timeout_ms
# if some partitions are already ready to be sent, the select time
# would be 0; otherwise if some partition already has some data
# accumulated but not ready yet, the select time will be the time
# difference between now and its linger expiry time; otherwise the
# select time will be the time difference between now and the
# metadata expiry time
self._client.poll(timeout_ms=poll_timeout_ms)
def _maybe_send_transactional_request(self):
if self._transaction_manager.is_completing() and self._accumulator.has_incomplete:
if self._transaction_manager.is_aborting():
self._accumulator.abort_undrained_batches(Errors.KafkaError("Failing batch since transaction was aborted"))
# There may still be requests left which are being retried. Since we do not know whether they had
# been successfully appended to the broker log, we must resend them until their final status is clear.
# If they had been appended and we did not receive the response, then our sequence number would no longer
# be correct, which would lead to an OutOfOrderSequenceNumberError.
if not self._accumulator.flush_in_progress():
self._accumulator.begin_flush()
next_request_handler = self._transaction_manager.next_request_handler(self._accumulator.has_incomplete)
if next_request_handler is None:
return False
log.debug("%s: Sending transactional request %s", str(self), next_request_handler.request)
while not self._force_close:
target_node = None
try:
if next_request_handler.needs_coordinator():
target_node = self._transaction_manager.coordinator(next_request_handler.coordinator_type)
if target_node is None:
self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
break
elif not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']):
self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
target_node = None
break
else:
target_node = self._client.least_loaded_node()
if target_node is not None and not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']):
target_node = None
if target_node is not None:
if next_request_handler.is_retry:
time.sleep(self.config['retry_backoff_ms'] / 1000)
txn_correlation_id = self._transaction_manager.next_in_flight_request_correlation_id()
future = self._client.send(target_node, next_request_handler.request)
future.add_both(next_request_handler.on_complete, txn_correlation_id)
return True
except Exception as e:
log.warn("%s: Got an exception when trying to find a node to send a transactional request to. Going to back off and retry: %s", str(self), e)
if next_request_handler.needs_coordinator():
self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
break
time.sleep(self.config['retry_backoff_ms'] / 1000)
self._metadata.request_update()
if target_node is None:
self._transaction_manager.retry(next_request_handler)
return True
def _maybe_abort_batches(self, exc):
if self._accumulator.has_incomplete:
log.error("%s: Aborting producer batches due to fatal error: %s", str(self), exc)
self._accumulator.abort_batches(exc)
def initiate_close(self):
"""Start closing the sender (won't complete until all data is sent)."""
@@ -180,82 +355,164 @@ class Sender(threading.Thread):
self._topics_to_add.add(topic)
self.wakeup()
def _maybe_wait_for_producer_id(self):
while not self._transaction_manager.has_producer_id():
try:
node_id = self._client.least_loaded_node()
if node_id is None or not self._client.await_ready(node_id):
log.debug("%s, Could not find an available broker to send InitProducerIdRequest to." +
" Will back off and try again.", str(self))
time.sleep(self._client.least_loaded_node_refresh_ms() / 1000)
continue
version = self._client.api_version(InitProducerIdRequest, max_version=1)
request = InitProducerIdRequest[version](
transactional_id=self.config['transactional_id'],
transaction_timeout_ms=self.config['transaction_timeout_ms'],
)
response = self._client.send_and_receive(node_id, request)
error_type = Errors.for_code(response.error_code)
if error_type is Errors.NoError:
self._transaction_manager.set_producer_id_and_epoch(ProducerIdAndEpoch(response.producer_id, response.producer_epoch))
break
elif getattr(error_type, 'retriable', False):
log.debug("%s: Retriable error from InitProducerId response: %s", str(self), error_type.__name__)
if getattr(error_type, 'invalid_metadata', False):
self._metadata.request_update()
else:
self._transaction_manager.transition_to_fatal_error(error_type())
break
except Errors.KafkaConnectionError:
log.debug("%s: Broker %s disconnected while awaiting InitProducerId response", str(self), node_id)
except Errors.RequestTimedOutError:
log.debug("%s: InitProducerId request to node %s timed out", str(self), node_id)
log.debug("%s: Retry InitProducerIdRequest in %sms.", str(self), self.config['retry_backoff_ms'])
time.sleep(self.config['retry_backoff_ms'] / 1000)
def _failed_produce(self, batches, node_id, error):
log.debug("Error sending produce request to node %d: %s", node_id, error) # trace
log.error("%s: Error sending produce request to node %d: %s", str(self), node_id, error) # trace
for batch in batches:
self._complete_batch(batch, error, -1, None)
self._complete_batch(batch, error, -1)
def _handle_produce_response(self, node_id, send_time, batches, response):
"""Handle a produce response."""
# if we have a response, parse it
log.debug('Parsing produce response: %r', response)
log.debug('%s: Parsing produce response: %r', str(self), response)
if response:
batches_by_partition = dict([(batch.topic_partition, batch)
for batch in batches])
for topic, partitions in response.topics:
for partition_info in partitions:
global_error = None
log_start_offset = None
if response.API_VERSION < 2:
partition, error_code, offset = partition_info
ts = None
elif 2 <= response.API_VERSION <= 4:
partition, error_code, offset, ts = partition_info
elif 5 <= response.API_VERSION <= 7:
partition, error_code, offset, ts, log_start_offset = partition_info
partition, error_code, offset, ts, _log_start_offset = partition_info
else:
# the ignored parameter is record_error of type list[(batch_index: int, error_message: str)]
partition, error_code, offset, ts, log_start_offset, _, global_error = partition_info
# Currently unused / TODO: KIP-467
partition, error_code, offset, ts, _log_start_offset, _record_errors, _global_error = partition_info
tp = TopicPartition(topic, partition)
error = Errors.for_code(error_code)
batch = batches_by_partition[tp]
self._complete_batch(batch, error, offset, ts, log_start_offset, global_error)
if response.API_VERSION > 0:
self._sensors.record_throttle_time(response.throttle_time_ms, node=node_id)
self._complete_batch(batch, error, offset, timestamp_ms=ts)
else:
# this is the acks = 0 case, just complete all requests
for batch in batches:
self._complete_batch(batch, None, -1, None)
self._complete_batch(batch, None, -1)
def _complete_batch(self, batch, error, base_offset, timestamp_ms=None, log_start_offset=None, global_error=None):
def _fail_batch(self, batch, exception, base_offset=None, timestamp_ms=None):
exception = exception if type(exception) is not type else exception()
if self._transaction_manager:
if isinstance(exception, Errors.OutOfOrderSequenceNumberError) and \
not self._transaction_manager.is_transactional() and \
self._transaction_manager.has_producer_id(batch.producer_id):
log.error("%s: The broker received an out of order sequence number for topic-partition %s"
" at offset %s. This indicates data loss on the broker, and should be investigated.",
str(self), batch.topic_partition, base_offset)
# Reset the transaction state since we have hit an irrecoverable exception and cannot make any guarantees
# about the previously committed message. Note that this will discard the producer id and sequence
# numbers for all existing partitions.
self._transaction_manager.reset_producer_id()
elif isinstance(exception, (Errors.ClusterAuthorizationFailedError,
Errors.TransactionalIdAuthorizationFailedError,
Errors.ProducerFencedError,
Errors.InvalidTxnStateError)):
self._transaction_manager.transition_to_fatal_error(exception)
elif self._transaction_manager.is_transactional():
self._transaction_manager.transition_to_abortable_error(exception)
if self._sensors:
self._sensors.record_errors(batch.topic_partition.topic, batch.record_count)
if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms, exception=exception):
self._maybe_remove_from_inflight_batches(batch)
self._accumulator.deallocate(batch)
def _complete_batch(self, batch, error, base_offset, timestamp_ms=None):
"""Complete or retry the given batch of records.
Arguments:
batch (RecordBatch): The record batch
batch (ProducerBatch): The record batch
error (Exception): The error (or None if none)
base_offset (int): The base offset assigned to the records if successful
timestamp_ms (int, optional): The timestamp returned by the broker for this batch
log_start_offset (int): The start offset of the log at the time this produce response was created
global_error (str): The summarising error message
"""
# Standardize no-error to None
if error is Errors.NoError:
error = None
if error is not None and self._can_retry(batch, error):
# retry
log.warning("Got error produce response on topic-partition %s,"
" retrying (%d attempts left). Error: %s",
batch.topic_partition,
self.config['retries'] - batch.attempts - 1,
global_error or error)
self._accumulator.reenqueue(batch)
self._sensors.record_retries(batch.topic_partition.topic, batch.record_count)
if error is not None:
if self._can_retry(batch, error):
# retry
log.warning("%s: Got error produce response on topic-partition %s,"
" retrying (%s attempts left). Error: %s",
str(self), batch.topic_partition,
self.config['retries'] - batch.attempts - 1,
error)
# If idempotence is enabled only retry the request if the batch matches our current producer id and epoch
if not self._transaction_manager or self._transaction_manager.producer_id_and_epoch.match(batch):
log.debug("%s: Retrying batch to topic-partition %s. Sequence number: %s",
str(self), batch.topic_partition,
self._transaction_manager.sequence_number(batch.topic_partition) if self._transaction_manager else None)
self._accumulator.reenqueue(batch)
self._maybe_remove_from_inflight_batches(batch)
if self._sensors:
self._sensors.record_retries(batch.topic_partition.topic, batch.record_count)
else:
log.warning("%s: Attempted to retry sending a batch but the producer id/epoch changed from %s/%s to %s/%s. This batch will be dropped",
str(self), batch.producer_id, batch.producer_epoch,
self._transaction_manager.producer_id_and_epoch.producer_id,
self._transaction_manager.producer_id_and_epoch.epoch)
self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms)
else:
if error is Errors.TopicAuthorizationFailedError:
error = error(batch.topic_partition.topic)
# tell the user the result of their request
self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms)
if error is Errors.UnknownTopicOrPartitionError:
log.warning("%s: Received unknown topic or partition error in produce request on partition %s."
" The topic/partition may not exist or the user may not have Describe access to it",
str(self), batch.topic_partition)
if getattr(error, 'invalid_metadata', False):
self._metadata.request_update()
else:
if error is Errors.TopicAuthorizationFailedError:
error = error(batch.topic_partition.topic)
if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms):
self._maybe_remove_from_inflight_batches(batch)
self._accumulator.deallocate(batch)
# tell the user the result of their request
batch.done(base_offset, timestamp_ms, error, log_start_offset, global_error)
self._accumulator.deallocate(batch)
if error is not None:
self._sensors.record_errors(batch.topic_partition.topic, batch.record_count)
if getattr(error, 'invalid_metadata', False):
self._metadata.request_update()
if self._transaction_manager and self._transaction_manager.producer_id_and_epoch.match(batch):
self._transaction_manager.increment_sequence_number(batch.topic_partition, batch.record_count)
log.debug("%s: Incremented sequence number for topic-partition %s to %s", str(self), batch.topic_partition,
self._transaction_manager.sequence_number(batch.topic_partition))
# Unmute the completed partition.
if self.config['guarantee_message_order']:
@@ -266,8 +523,10 @@ class Sender(threading.Thread):
We can retry a send if the error is transient and the number of
attempts taken is fewer than the maximum allowed
"""
return (batch.attempts < self.config['retries']
and getattr(error, 'retriable', False))
return (not batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms) and
batch.attempts < self.config['retries'] and
batch.final_state is None and
getattr(error, 'retriable', False))
def _create_produce_requests(self, collated):
"""
@@ -275,23 +534,24 @@ class Sender(threading.Thread):
per-node basis.
Arguments:
collated: {node_id: [RecordBatch]}
collated: {node_id: [ProducerBatch]}
Returns:
dict: {node_id: ProduceRequest} (version depends on api_version)
dict: {node_id: ProduceRequest} (version depends on client api_versions)
"""
requests = {}
for node_id, batches in six.iteritems(collated):
requests[node_id] = self._produce_request(
node_id, self.config['acks'],
self.config['request_timeout_ms'], batches)
if batches:
requests[node_id] = self._produce_request(
node_id, self.config['acks'],
self.config['request_timeout_ms'], batches)
return requests
def _produce_request(self, node_id, acks, timeout, batches):
"""Create a produce request from the given record batches.
Returns:
ProduceRequest (version depends on api_version)
ProduceRequest (version depends on client api_versions)
"""
produce_records_by_partition = collections.defaultdict(dict)
for batch in batches:
@@ -301,32 +561,26 @@ class Sender(threading.Thread):
buf = batch.records.buffer()
produce_records_by_partition[topic][partition] = buf
kwargs = {}
if self.config['api_version'] >= (2, 1):
version = 7
elif self.config['api_version'] >= (2, 0):
version = 6
elif self.config['api_version'] >= (1, 1):
version = 5
elif self.config['api_version'] >= (1, 0):
version = 4
elif self.config['api_version'] >= (0, 11):
version = 3
kwargs = dict(transactional_id=None)
elif self.config['api_version'] >= (0, 10):
version = 2
elif self.config['api_version'] == (0, 9):
version = 1
version = self._client.api_version(ProduceRequest, max_version=7)
topic_partition_data = [
(topic, list(partition_info.items()))
for topic, partition_info in six.iteritems(produce_records_by_partition)]
transactional_id = self._transaction_manager.transactional_id if self._transaction_manager else None
if version >= 3:
return ProduceRequest[version](
transactional_id=transactional_id,
required_acks=acks,
timeout=timeout,
topics=topic_partition_data,
)
else:
version = 0
return ProduceRequest[version](
required_acks=acks,
timeout=timeout,
topics=[(topic, list(partition_info.items()))
for topic, partition_info
in six.iteritems(produce_records_by_partition)],
**kwargs
)
if transactional_id is not None:
log.warning('%s: Broker does not support ProduceRequest v3+, required for transactional_id', str(self))
return ProduceRequest[version](
required_acks=acks,
timeout=timeout,
topics=topic_partition_data,
)
def wakeup(self):
"""Wake up the selector associated with this send thread."""
@@ -335,6 +589,9 @@ class Sender(threading.Thread):
def bootstrap_connected(self):
return self._client.bootstrap_connected()
def __str__(self):
return "<Sender client_id=%s transactional_id=%s>" % (self.config['client_id'], self.config['transactional_id'])
class SenderMetrics(object):
@@ -367,15 +624,6 @@ class SenderMetrics(object):
sensor_name=sensor_name,
description='The maximum time in ms record batches spent in the record accumulator.')
sensor_name = 'produce-throttle-time'
self.produce_throttle_time_sensor = self.metrics.sensor(sensor_name)
self.add_metric('produce-throttle-time-avg', Avg(),
sensor_name=sensor_name,
description='The average throttle time in ms')
self.add_metric('produce-throttle-time-max', Max(),
sensor_name=sensor_name,
description='The maximum throttle time in ms')
sensor_name = 'records-per-request'
self.records_per_request_sensor = self.metrics.sensor(sensor_name)
self.add_metric('record-send-rate', Rate(),
@@ -498,8 +746,9 @@ class SenderMetrics(object):
records += batch.record_count
total_bytes += batch.records.size_in_bytes()
self.records_per_request_sensor.record(records)
self.byte_rate_sensor.record(total_bytes)
if node_batch:
self.records_per_request_sensor.record(records)
self.byte_rate_sensor.record(total_bytes)
def record_retries(self, topic, count):
self.retry_sensor.record(count)
@@ -512,6 +761,3 @@ class SenderMetrics(object):
sensor = self.metrics.get_sensor('topic.' + topic + '.record-errors')
if sensor:
sensor.record(count)
def record_throttle_time(self, throttle_time_ms, node=None):
self.produce_throttle_time_sensor.record(throttle_time_ms)
@@ -0,0 +1,981 @@
from __future__ import absolute_import, division
import abc
import collections
import heapq
import logging
import threading
from kafka.vendor import six
try:
# enum in stdlib as of py3.4
from enum import IntEnum # pylint: disable=import-error
except ImportError:
# vendored backport module
from kafka.vendor.enum34 import IntEnum
import kafka.errors as Errors
from kafka.protocol.add_offsets_to_txn import AddOffsetsToTxnRequest
from kafka.protocol.add_partitions_to_txn import AddPartitionsToTxnRequest
from kafka.protocol.end_txn import EndTxnRequest
from kafka.protocol.find_coordinator import FindCoordinatorRequest
from kafka.protocol.init_producer_id import InitProducerIdRequest
from kafka.protocol.txn_offset_commit import TxnOffsetCommitRequest
from kafka.structs import TopicPartition
log = logging.getLogger(__name__)
NO_PRODUCER_ID = -1
NO_PRODUCER_EPOCH = -1
NO_SEQUENCE = -1
class ProducerIdAndEpoch(object):
__slots__ = ('producer_id', 'epoch')
def __init__(self, producer_id, epoch):
self.producer_id = producer_id
self.epoch = epoch
@property
def is_valid(self):
return NO_PRODUCER_ID < self.producer_id
def match(self, batch):
return self.producer_id == batch.producer_id and self.epoch == batch.producer_epoch
def __eq__(self, other):
return isinstance(other, ProducerIdAndEpoch) and self.producer_id == other.producer_id and self.epoch == other.epoch
def __str__(self):
return "ProducerIdAndEpoch(producer_id={}, epoch={})".format(self.producer_id, self.epoch)
class TransactionState(IntEnum):
UNINITIALIZED = 0
INITIALIZING = 1
READY = 2
IN_TRANSACTION = 3
COMMITTING_TRANSACTION = 4
ABORTING_TRANSACTION = 5
ABORTABLE_ERROR = 6
FATAL_ERROR = 7
@classmethod
def is_transition_valid(cls, source, target):
if target == cls.INITIALIZING:
return source == cls.UNINITIALIZED
elif target == cls.READY:
return source in (cls.INITIALIZING, cls.COMMITTING_TRANSACTION, cls.ABORTING_TRANSACTION)
elif target == cls.IN_TRANSACTION:
return source == cls.READY
elif target == cls.COMMITTING_TRANSACTION:
return source == cls.IN_TRANSACTION
elif target == cls.ABORTING_TRANSACTION:
return source in (cls.IN_TRANSACTION, cls.ABORTABLE_ERROR)
elif target == cls.ABORTABLE_ERROR:
return source in (cls.IN_TRANSACTION, cls.COMMITTING_TRANSACTION, cls.ABORTABLE_ERROR)
elif target == cls.UNINITIALIZED:
# Disallow transitions to UNINITIALIZED
return False
elif target == cls.FATAL_ERROR:
# We can transition to FATAL_ERROR unconditionally.
# FATAL_ERROR is never a valid starting state for any transition. So the only option is to close the
# producer or do purely non transactional requests.
return True
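# --- Editor's sketch (illustrative, not part of this commit) ---
# The table above gates every state change; _transition_to (below) raises
# unless (source, target) is allowed. Committing, for example, is only
# legal from IN_TRANSACTION:
assert TransactionState.is_transition_valid(
    TransactionState.IN_TRANSACTION, TransactionState.COMMITTING_TRANSACTION)
assert not TransactionState.is_transition_valid(
    TransactionState.READY, TransactionState.COMMITTING_TRANSACTION)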
class Priority(IntEnum):
# We use the priority to determine the order in which requests need to be sent out. For instance, if we have
# a pending FindCoordinator request, that must always go first. Next, if we need a producer id, that must go second.
# The endTxn request must always go last.
FIND_COORDINATOR = 0
INIT_PRODUCER_ID = 1
ADD_PARTITIONS_OR_OFFSETS = 2
END_TXN = 3
class TransactionManager(object):
"""
A class which maintains state for transactions. Also keeps the state necessary to ensure idempotent production.
"""
NO_INFLIGHT_REQUEST_CORRELATION_ID = -1
# The retry_backoff_ms is overridden to the following value if the first AddPartitions receives a
# CONCURRENT_TRANSACTIONS error.
ADD_PARTITIONS_RETRY_BACKOFF_MS = 20
def __init__(self, transactional_id=None, transaction_timeout_ms=0, retry_backoff_ms=100, api_version=(0, 11), metadata=None):
self._api_version = api_version
self._metadata = metadata
self._sequence_numbers = collections.defaultdict(lambda: 0)
self.transactional_id = transactional_id
self.transaction_timeout_ms = transaction_timeout_ms
self._transaction_coordinator = None
self._consumer_group_coordinator = None
self._new_partitions_in_transaction = set()
self._pending_partitions_in_transaction = set()
self._partitions_in_transaction = set()
self._pending_txn_offset_commits = dict()
self._current_state = TransactionState.UNINITIALIZED
self._last_error = None
self.producer_id_and_epoch = ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH)
self._transaction_started = False
self._pending_requests = [] # priority queue via heapq
self._pending_requests_sort_id = 0
self._in_flight_request_correlation_id = self.NO_INFLIGHT_REQUEST_CORRELATION_ID
# This is used by the TxnRequestHandlers to control how long to back off before a given request is retried.
# For instance, this value is lowered by the AddPartitionsToTxnHandler when it receives a CONCURRENT_TRANSACTIONS
# error for the first AddPartitionsRequest in a transaction.
self.retry_backoff_ms = retry_backoff_ms
self._lock = threading.Condition()
def initialize_transactions(self):
with self._lock:
self._ensure_transactional()
self._transition_to(TransactionState.INITIALIZING)
self.set_producer_id_and_epoch(ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH))
self._sequence_numbers.clear()
handler = InitProducerIdHandler(self, self.transaction_timeout_ms)
self._enqueue_request(handler)
return handler.result
def begin_transaction(self):
with self._lock:
self._ensure_transactional()
self._maybe_fail_with_error()
self._transition_to(TransactionState.IN_TRANSACTION)
def begin_commit(self):
with self._lock:
self._ensure_transactional()
self._maybe_fail_with_error()
self._transition_to(TransactionState.COMMITTING_TRANSACTION)
return self._begin_completing_transaction(True)
def begin_abort(self):
with self._lock:
self._ensure_transactional()
if self._current_state != TransactionState.ABORTABLE_ERROR:
self._maybe_fail_with_error()
self._transition_to(TransactionState.ABORTING_TRANSACTION)
# We're aborting the transaction, so there should be no need to add new partitions
self._new_partitions_in_transaction.clear()
return self._begin_completing_transaction(False)
def _begin_completing_transaction(self, committed):
if self._new_partitions_in_transaction:
self._enqueue_request(self._add_partitions_to_transaction_handler())
handler = EndTxnHandler(self, committed)
self._enqueue_request(handler)
return handler.result
def send_offsets_to_transaction(self, offsets, consumer_group_id):
with self._lock:
self._ensure_transactional()
self._maybe_fail_with_error()
if self._current_state != TransactionState.IN_TRANSACTION:
raise Errors.KafkaError("Cannot send offsets to transaction because the producer is not in an active transaction")
log.debug("Begin adding offsets %s for consumer group %s to transaction", offsets, consumer_group_id)
handler = AddOffsetsToTxnHandler(self, consumer_group_id, offsets)
self._enqueue_request(handler)
return handler.result
def maybe_add_partition_to_transaction(self, topic_partition):
with self._lock:
self._fail_if_not_ready_for_send()
if self.is_partition_added(topic_partition) or self.is_partition_pending_add(topic_partition):
return
log.debug("Begin adding new partition %s to transaction", topic_partition)
self._new_partitions_in_transaction.add(topic_partition)
def _fail_if_not_ready_for_send(self):
with self._lock:
if self.has_error():
raise Errors.KafkaError(
"Cannot perform send because at least one previous transactional or"
" idempotent request has failed with errors.", self._last_error)
if self.is_transactional():
if not self.has_producer_id():
raise Errors.IllegalStateError(
"Cannot perform a 'send' before completing a call to init_transactions"
" when transactions are enabled.")
if self._current_state != TransactionState.IN_TRANSACTION:
raise Errors.IllegalStateError("Cannot call send in state %s" % (self._current_state.name,))
def is_send_to_partition_allowed(self, tp):
with self._lock:
if self.has_fatal_error():
return False
return not self.is_transactional() or tp in self._partitions_in_transaction
def has_producer_id(self, producer_id=None):
if producer_id is None:
return self.producer_id_and_epoch.is_valid
else:
return self.producer_id_and_epoch.producer_id == producer_id
def is_transactional(self):
return self.transactional_id is not None
def has_partitions_to_add(self):
with self._lock:
return bool(self._new_partitions_in_transaction) or bool(self._pending_partitions_in_transaction)
def is_completing(self):
with self._lock:
return self._current_state in (
TransactionState.COMMITTING_TRANSACTION,
TransactionState.ABORTING_TRANSACTION)
@property
def last_error(self):
return self._last_error
def has_error(self):
with self._lock:
return self._current_state in (
TransactionState.ABORTABLE_ERROR,
TransactionState.FATAL_ERROR)
def is_aborting(self):
with self._lock:
return self._current_state == TransactionState.ABORTING_TRANSACTION
def transition_to_abortable_error(self, exc):
with self._lock:
if self._current_state == TransactionState.ABORTING_TRANSACTION:
log.debug("Skipping transition to abortable error state since the transaction is already being "
" aborted. Underlying exception: %s", exc)
return
self._transition_to(TransactionState.ABORTABLE_ERROR, error=exc)
def transition_to_fatal_error(self, exc):
with self._lock:
self._transition_to(TransactionState.FATAL_ERROR, error=exc)
def is_partition_added(self, partition):
with self._lock:
return partition in self._partitions_in_transaction
def is_partition_pending_add(self, partition):
return partition in self._new_partitions_in_transaction or partition in self._pending_partitions_in_transaction
def has_producer_id_and_epoch(self, producer_id, producer_epoch):
return (
self.producer_id_and_epoch.producer_id == producer_id and
self.producer_id_and_epoch.epoch == producer_epoch
)
def set_producer_id_and_epoch(self, producer_id_and_epoch):
if not isinstance(producer_id_and_epoch, ProducerIdAndEpoch):
raise TypeError("ProducerAndIdEpoch type required")
log.info("ProducerId set to %s with epoch %s",
producer_id_and_epoch.producer_id, producer_id_and_epoch.epoch)
self.producer_id_and_epoch = producer_id_and_epoch
def reset_producer_id(self):
"""
This method is used when the producer needs to reset its internal state because of an irrecoverable exception
from the broker.
We need to reset the producer id and associated state when we have sent a batch to the broker, but we either get
a non-retriable exception or we run out of retries, or the batch expired in the producer queue after it was already
sent to the broker.
In all of these cases, we don't know whether the batch was actually committed on the broker, and hence whether the
sequence number was actually updated. If we don't reset the producer state, we risk the chance that all future
messages will return an OutOfOrderSequenceNumberError.
Note that we can't reset the producer state for the transactional producer as this would mean bumping the epoch
for the same producer id. This might involve aborting the ongoing transaction during the initProducerIdRequest,
and the user would not have any way of knowing this happened. So for the transactional producer,
it's best to return the produce error to the user and let them abort the transaction and close the producer explicitly.
"""
with self._lock:
if self.is_transactional():
raise Errors.IllegalStateError(
"Cannot reset producer state for a transactional producer."
" You must either abort the ongoing transaction or"
" reinitialize the transactional producer instead")
self.set_producer_id_and_epoch(ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH))
self._sequence_numbers.clear()
def sequence_number(self, tp):
with self._lock:
return self._sequence_numbers[tp]
def increment_sequence_number(self, tp, increment):
with self._lock:
if tp not in self._sequence_numbers:
raise Errors.IllegalStateError("Attempt to increment sequence number for a partition with no current sequence.")
# Sequence number wraps at java max int
base = self._sequence_numbers[tp]
if base > (2147483647 - increment):
self._sequence_numbers[tp] = increment - (2147483647 - base) - 1
else:
self._sequence_numbers[tp] += increment
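# --- Editor's sketch (illustrative, not part of this commit) ---
# Sequence numbers wrap at the Java int32 maximum (2**31 - 1 = 2147483647),
# matching the broker. The branch above keeps the counter in [0, 2**31 - 1];
# standalone version of the same arithmetic:
MAX_INT32 = 2147483647

def next_sequence(base, increment):
    if base > MAX_INT32 - increment:
        return increment - (MAX_INT32 - base) - 1
    return base + increment

assert next_sequence(0, 5) == 5
assert next_sequence(MAX_INT32, 1) == 0  # one step past the max wraps to 0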
def next_request_handler(self, has_incomplete_batches):
with self._lock:
if self._new_partitions_in_transaction:
self._enqueue_request(self._add_partitions_to_transaction_handler())
if not self._pending_requests:
return None
_, _, next_request_handler = self._pending_requests[0]
# Do not send the EndTxn until all batches have been flushed
if isinstance(next_request_handler, EndTxnHandler) and has_incomplete_batches:
return None
heapq.heappop(self._pending_requests)
if self._maybe_terminate_request_with_error(next_request_handler):
log.debug("Not sending transactional request %s because we are in an error state",
next_request_handler.request)
return None
if isinstance(next_request_handler, EndTxnHandler) and not self._transaction_started:
next_request_handler.result.done()
if self._current_state != TransactionState.FATAL_ERROR:
log.debug("Not sending EndTxn for completed transaction since no partitions"
" or offsets were successfully added")
self._complete_transaction()
try:
_, _, next_request_handler = heapq.heappop(self._pending_requests)
except IndexError:
next_request_handler = None
if next_request_handler:
log.debug("Request %s dequeued for sending", next_request_handler.request)
return next_request_handler
def retry(self, request):
with self._lock:
request.set_retry()
self._enqueue_request(request)
def authentication_failed(self, exc):
with self._lock:
for _, _, request in self._pending_requests:
request.fatal_error(exc)
def coordinator(self, coord_type):
if coord_type == 'group':
return self._consumer_group_coordinator
elif coord_type == 'transaction':
return self._transaction_coordinator
else:
raise Errors.IllegalStateError("Received an invalid coordinator type: %s" % (coord_type,))
def lookup_coordinator_for_request(self, request):
self._lookup_coordinator(request.coordinator_type, request.coordinator_key)
def next_in_flight_request_correlation_id(self):
self._in_flight_request_correlation_id += 1
return self._in_flight_request_correlation_id
def clear_in_flight_transactional_request_correlation_id(self):
self._in_flight_request_correlation_id = self.NO_INFLIGHT_REQUEST_CORRELATION_ID
def has_in_flight_transactional_request(self):
return self._in_flight_request_correlation_id != self.NO_INFLIGHT_REQUEST_CORRELATION_ID
def has_fatal_error(self):
return self._current_state == TransactionState.FATAL_ERROR
def has_abortable_error(self):
return self._current_state == TransactionState.ABORTABLE_ERROR
# visible for testing
def _test_transaction_contains_partition(self, tp):
with self._lock:
return tp in self._partitions_in_transaction
# visible for testing
def _test_has_pending_offset_commits(self):
return bool(self._pending_txn_offset_commits)
# visible for testing
def _test_has_ongoing_transaction(self):
with self._lock:
# transactions are considered ongoing once started until completion or a fatal error
return self._current_state == TransactionState.IN_TRANSACTION or self.is_completing() or self.has_abortable_error()
# visible for testing
def _test_is_ready(self):
with self._lock:
return self.is_transactional() and self._current_state == TransactionState.READY
def _transition_to(self, target, error=None):
with self._lock:
if not self._current_state.is_transition_valid(self._current_state, target):
raise Errors.KafkaError("TransactionalId %s: Invalid transition attempted from state %s to state %s" % (
self.transactional_id, self._current_state.name, target.name))
if target in (TransactionState.FATAL_ERROR, TransactionState.ABORTABLE_ERROR):
if error is None:
raise Errors.IllegalArgumentError("Cannot transition to %s with a None exception" % (target.name,))
self._last_error = error
else:
self._last_error = None
if self._last_error is not None:
log.debug("Transition from state %s to error state %s (%s)", self._current_state.name, target.name, self._last_error)
else:
log.debug("Transition from state %s to %s", self._current_state, target)
self._current_state = target
def _ensure_transactional(self):
if not self.is_transactional():
raise Errors.IllegalStateError("Transactional method invoked on a non-transactional producer.")
def _maybe_fail_with_error(self):
if self.has_error():
raise Errors.KafkaError("Cannot execute transactional method because we are in an error state: %s" % (self._last_error,))
def _maybe_terminate_request_with_error(self, request_handler):
if self.has_error():
if self.has_abortable_error() and isinstance(request_handler, FindCoordinatorHandler):
# No harm letting the FindCoordinator request go through if we're expecting to abort
return False
request_handler.fail(self._last_error)
return True
return False
def _next_pending_requests_sort_id(self):
self._pending_requests_sort_id += 1
return self._pending_requests_sort_id
def _enqueue_request(self, request_handler):
log.debug("Enqueuing transactional request %s", request_handler.request)
heapq.heappush(
self._pending_requests,
(
request_handler.priority, # keep lowest priority at head of queue
self._next_pending_requests_sort_id(), # break ties
request_handler
)
)
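# --- Editor's sketch (illustrative, not part of this commit) ---
# Pushing (priority, monotonic id, handler) tuples keeps the heap ordering
# stable and avoids ever comparing two handler objects (which define no
# ordering) when priorities tie:
import heapq
import itertools

ids = itertools.count()
pq = []
heapq.heappush(pq, (1, next(ids), 'init_pid'))
heapq.heappush(pq, (0, next(ids), 'find_coordinator'))
heapq.heappush(pq, (1, next(ids), 'add_partitions'))
order = [name for _, _, name in (heapq.heappop(pq) for _ in range(3))]
assert order == ['find_coordinator', 'init_pid', 'add_partitions']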
def _lookup_coordinator(self, coord_type, coord_key):
with self._lock:
if coord_type == 'group':
self._consumer_group_coordinator = None
elif coord_type == 'transaction':
self._transaction_coordinator = None
else:
raise Errors.IllegalStateError("Invalid coordinator type: %s" % (coord_type,))
self._enqueue_request(FindCoordinatorHandler(self, coord_type, coord_key))
def _complete_transaction(self):
with self._lock:
self._transition_to(TransactionState.READY)
self._transaction_started = False
self._new_partitions_in_transaction.clear()
self._pending_partitions_in_transaction.clear()
self._partitions_in_transaction.clear()
def _add_partitions_to_transaction_handler(self):
with self._lock:
self._pending_partitions_in_transaction.update(self._new_partitions_in_transaction)
self._new_partitions_in_transaction.clear()
return AddPartitionsToTxnHandler(self, self._pending_partitions_in_transaction)
class TransactionalRequestResult(object):
def __init__(self):
self._latch = threading.Event()
self._error = None
def done(self, error=None):
self._error = error
self._latch.set()
def wait(self, timeout_ms=None):
timeout = timeout_ms / 1000 if timeout_ms is not None else None
success = self._latch.wait(timeout)
if self._error:
raise self._error
return success
@property
def is_done(self):
return self._latch.is_set()
@property
def succeeded(self):
return self._latch.is_set() and self._error is None
@property
def failed(self):
return self._latch.is_set() and self._error is not None
@property
def exception(self):
return self._error
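

# Abstract base for all transactional request handlers. Subclasses supply the
# wire request, a queue priority, and handle_response(); connection errors,
# retries, and coordinator rediscovery are handled generically in on_complete().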
@six.add_metaclass(abc.ABCMeta)
class TxnRequestHandler(object):
def __init__(self, transaction_manager, result=None):
self.transaction_manager = transaction_manager
self.retry_backoff_ms = transaction_manager.retry_backoff_ms
self.request = None
self._result = result or TransactionalRequestResult()
self._is_retry = False
@property
def transactional_id(self):
return self.transaction_manager.transactional_id
@property
def producer_id(self):
return self.transaction_manager.producer_id_and_epoch.producer_id
@property
def producer_epoch(self):
return self.transaction_manager.producer_id_and_epoch.epoch
def fatal_error(self, exc):
self.transaction_manager.transition_to_fatal_error(exc)
self._result.done(error=exc)
def abortable_error(self, exc):
self.transaction_manager.transition_to_abortable_error(exc)
self._result.done(error=exc)
def fail(self, exc):
self._result.done(error=exc)
def reenqueue(self):
with self.transaction_manager._lock:
self._is_retry = True
self.transaction_manager._enqueue_request(self)
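
    # Network-thread callback. The manager permits only one in-flight
    # transactional request at a time, so a correlation-id mismatch means that
    # invariant was broken and is treated as fatal.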
def on_complete(self, correlation_id, response_or_exc):
if correlation_id != self.transaction_manager._in_flight_request_correlation_id:
self.fatal_error(RuntimeError("Detected more than one in-flight transactional request."))
else:
self.transaction_manager.clear_in_flight_transactional_request_correlation_id()
if isinstance(response_or_exc, Errors.KafkaConnectionError):
log.debug("Disconnected from node. Will retry.")
if self.needs_coordinator():
self.transaction_manager._lookup_coordinator(self.coordinator_type, self.coordinator_key)
self.reenqueue()
elif isinstance(response_or_exc, Errors.UnsupportedVersionError):
self.fatal_error(response_or_exc)
elif not isinstance(response_or_exc, (Exception, type(None))):
log.debug("Received transactional response %s for request %s", response_or_exc, self.request)
with self.transaction_manager._lock:
self.handle_response(response_or_exc)
else:
self.fatal_error(Errors.KafkaError("Could not execute transactional request for unknown reasons: %s" % response_or_exc))
def needs_coordinator(self):
return self.coordinator_type is not None
@property
def result(self):
return self._result
@property
def coordinator_type(self):
return 'transaction'
@property
def coordinator_key(self):
return self.transaction_manager.transactional_id
def set_retry(self):
self._is_retry = True
@property
def is_retry(self):
return self._is_retry
@abc.abstractmethod
def handle_response(self, response):
pass
@abc.abstractproperty
def priority(self):
pass


class InitProducerIdHandler(TxnRequestHandler):
def __init__(self, transaction_manager, transaction_timeout_ms):
super(InitProducerIdHandler, self).__init__(transaction_manager)
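        # Pick the request version from the negotiated broker API version:
        # v1 requires brokers >= 2.0, otherwise fall back to v0.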
if transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
self.request = InitProducerIdRequest[version](
transactional_id=self.transactional_id,
transaction_timeout_ms=transaction_timeout_ms)
@property
def priority(self):
return Priority.INIT_PRODUCER_ID
def handle_response(self, response):
error = Errors.for_code(response.error_code)
if error is Errors.NoError:
self.transaction_manager.set_producer_id_and_epoch(ProducerIdAndEpoch(response.producer_id, response.producer_epoch))
self.transaction_manager._transition_to(TransactionState.READY)
self._result.done()
elif error in (Errors.NotCoordinatorError, Errors.CoordinatorNotAvailableError):
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
self.reenqueue()
elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError):
self.reenqueue()
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
else:
self.fatal_error(Errors.KafkaError("Unexpected error in InitProducerIdResponse: %s" % (error())))


class AddPartitionsToTxnHandler(TxnRequestHandler):
def __init__(self, transaction_manager, topic_partitions):
super(AddPartitionsToTxnHandler, self).__init__(transaction_manager)
if transaction_manager._api_version >= (2, 7):
version = 2
elif transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
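        # The wire format wants partitions grouped by topic, so fold the flat
        # set of TopicPartitions into {topic: [partition, ...]}.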
topic_data = collections.defaultdict(list)
for tp in topic_partitions:
topic_data[tp.topic].append(tp.partition)
self.request = AddPartitionsToTxnRequest[version](
transactional_id=self.transactional_id,
producer_id=self.producer_id,
producer_epoch=self.producer_epoch,
topics=list(topic_data.items()))
@property
def priority(self):
return Priority.ADD_PARTITIONS_OR_OFFSETS
def handle_response(self, response):
has_partition_errors = False
unauthorized_topics = set()
self.retry_backoff_ms = self.transaction_manager.retry_backoff_ms
results = {TopicPartition(topic, partition): Errors.for_code(error_code)
for topic, partition_data in response.results
for partition, error_code in partition_data}
for tp, error in six.iteritems(results):
if error is Errors.NoError:
continue
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError):
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
self.reenqueue()
return
elif error is Errors.ConcurrentTransactionsError:
self.maybe_override_retry_backoff_ms()
self.reenqueue()
return
elif error in (Errors.CoordinatorLoadInProgressError, Errors.UnknownTopicOrPartitionError):
self.reenqueue()
return
elif error is Errors.InvalidProducerEpochError:
self.fatal_error(error())
return
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
return
elif error in (Errors.InvalidProducerIdMappingError, Errors.InvalidTxnStateError):
self.fatal_error(Errors.KafkaError(error()))
return
elif error is Errors.TopicAuthorizationFailedError:
unauthorized_topics.add(tp.topic)
elif error is Errors.OperationNotAttemptedError:
log.debug("Did not attempt to add partition %s to transaction because other partitions in the"
" batch had errors.", tp)
has_partition_errors = True
else:
log.error("Could not add partition %s due to unexpected error %s", tp, error())
has_partition_errors = True
partitions = set(results)
# Remove the partitions from the pending set regardless of the result. We use the presence
# of partitions in the pending set to know when it is not safe to send batches. However, if
# the partitions failed to be added and we enter an error state, we expect the batches to be
# aborted anyway. In this case, we must be able to continue sending the batches which are in
# retry for partitions that were successfully added.
self.transaction_manager._pending_partitions_in_transaction -= partitions
if unauthorized_topics:
self.abortable_error(Errors.TopicAuthorizationFailedError(unauthorized_topics))
elif has_partition_errors:
self.abortable_error(Errors.KafkaError("Could not add partitions to transaction due to errors: %s" % (results)))
else:
log.debug("Successfully added partitions %s to transaction", partitions)
self.transaction_manager._partitions_in_transaction.update(partitions)
self.transaction_manager._transaction_started = True
self._result.done()
def maybe_override_retry_backoff_ms(self):
# We only want to reduce the backoff when retrying the first AddPartition which errored out due to a
# CONCURRENT_TRANSACTIONS error since this means that the previous transaction is still completing and
# we don't want to wait too long before trying to start the new one.
#
# This is only a temporary fix, the long term solution is being tracked in
# https://issues.apache.org/jira/browse/KAFKA-5482
if not self.transaction_manager._partitions_in_transaction:
self.retry_backoff_ms = min(self.transaction_manager.ADD_PARTITIONS_RETRY_BACKOFF_MS, self.retry_backoff_ms)


class FindCoordinatorHandler(TxnRequestHandler):
def __init__(self, transaction_manager, coord_type, coord_key):
super(FindCoordinatorHandler, self).__init__(transaction_manager)
self._coord_type = coord_type
self._coord_key = coord_key
if transaction_manager._api_version >= (2, 0):
version = 2
else:
version = 1
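        # FindCoordinator encodes the coordinator type as an int8:
        # 0 = group coordinator, 1 = transaction coordinator.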
if coord_type == 'group':
coord_type_int8 = 0
elif coord_type == 'transaction':
coord_type_int8 = 1
else:
raise ValueError("Unrecognized coordinator type: %s" % (coord_type,))
self.request = FindCoordinatorRequest[version](
coordinator_key=coord_key,
coordinator_type=coord_type_int8,
)
@property
def priority(self):
return Priority.FIND_COORDINATOR
@property
def coordinator_type(self):
return None
@property
def coordinator_key(self):
return None
def handle_response(self, response):
error = Errors.for_code(response.error_code)
if error is Errors.NoError:
coordinator_id = self.transaction_manager._metadata.add_coordinator(
response, self._coord_type, self._coord_key)
if self._coord_type == 'group':
self.transaction_manager._consumer_group_coordinator = coordinator_id
elif self._coord_type == 'transaction':
self.transaction_manager._transaction_coordinator = coordinator_id
self._result.done()
elif error is Errors.CoordinatorNotAvailableError:
self.reenqueue()
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
elif error is Errors.GroupAuthorizationFailedError:
self.abortable_error(error(self._coord_key))
else:
self.fatal_error(Errors.KafkaError(
"Could not find a coordinator with type %s with key %s due to"
" unexpected error: %s" % (self._coord_type, self._coord_key, error())))


class EndTxnHandler(TxnRequestHandler):
def __init__(self, transaction_manager, committed):
super(EndTxnHandler, self).__init__(transaction_manager)
if self.transaction_manager._api_version >= (2, 7):
version = 2
elif self.transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
self.request = EndTxnRequest[version](
transactional_id=self.transactional_id,
producer_id=self.producer_id,
producer_epoch=self.producer_epoch,
committed=committed)
@property
def priority(self):
return Priority.END_TXN
def handle_response(self, response):
error = Errors.for_code(response.error_code)
if error is Errors.NoError:
self.transaction_manager._complete_transaction()
self._result.done()
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError):
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
self.reenqueue()
elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError):
self.reenqueue()
elif error is Errors.InvalidProducerEpochError:
self.fatal_error(error())
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
elif error is Errors.InvalidTxnStateError:
self.fatal_error(error())
else:
self.fatal_error(Errors.KafkaError("Unhandled error in EndTxnResponse: %s" % (error())))


class AddOffsetsToTxnHandler(TxnRequestHandler):
def __init__(self, transaction_manager, consumer_group_id, offsets):
super(AddOffsetsToTxnHandler, self).__init__(transaction_manager)
self.consumer_group_id = consumer_group_id
self.offsets = offsets
if self.transaction_manager._api_version >= (2, 7):
version = 2
elif self.transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
self.request = AddOffsetsToTxnRequest[version](
transactional_id=self.transactional_id,
producer_id=self.producer_id,
producer_epoch=self.producer_epoch,
group_id=consumer_group_id)
@property
def priority(self):
return Priority.ADD_PARTITIONS_OR_OFFSETS
def handle_response(self, response):
error = Errors.for_code(response.error_code)
if error is Errors.NoError:
log.debug("Successfully added partition for consumer group %s to transaction", self.consumer_group_id)
# note the result is not completed until the TxnOffsetCommit returns
for tp, offset in six.iteritems(self.offsets):
self.transaction_manager._pending_txn_offset_commits[tp] = offset
handler = TxnOffsetCommitHandler(self.transaction_manager, self.consumer_group_id,
self.transaction_manager._pending_txn_offset_commits, self._result)
self.transaction_manager._enqueue_request(handler)
self.transaction_manager._transaction_started = True
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError):
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
self.reenqueue()
elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError):
self.reenqueue()
elif error is Errors.InvalidProducerEpochError:
self.fatal_error(error())
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
elif error is Errors.GroupAuthorizationFailedError:
self.abortable_error(error(self.consumer_group_id))
else:
self.fatal_error(Errors.KafkaError("Unexpected error in AddOffsetsToTxnResponse: %s" % (error())))


class TxnOffsetCommitHandler(TxnRequestHandler):
def __init__(self, transaction_manager, consumer_group_id, offsets, result):
super(TxnOffsetCommitHandler, self).__init__(transaction_manager, result=result)
self.consumer_group_id = consumer_group_id
self.offsets = offsets
self.request = self._build_request()
def _build_request(self):
if self.transaction_manager._api_version >= (2, 1):
version = 2
elif self.transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
topic_data = collections.defaultdict(list)
for tp, offset in six.iteritems(self.offsets):
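            # v2 adds the leader epoch to each offset; older versions omit it.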
if version >= 2:
partition_data = (tp.partition, offset.offset, offset.leader_epoch, offset.metadata)
else:
partition_data = (tp.partition, offset.offset, offset.metadata)
topic_data[tp.topic].append(partition_data)
return TxnOffsetCommitRequest[version](
transactional_id=self.transactional_id,
group_id=self.consumer_group_id,
producer_id=self.producer_id,
producer_epoch=self.producer_epoch,
topics=list(topic_data.items()))
@property
def priority(self):
return Priority.ADD_PARTITIONS_OR_OFFSETS
@property
def coordinator_type(self):
return 'group'
@property
def coordinator_key(self):
return self.consumer_group_id
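
    # Offsets commit per-partition: successful partitions leave the pending
    # set, retriable failures are rebuilt into a fresh request and re-queued,
    # and anything fatal or abortable fails the shared result immediately.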
def handle_response(self, response):
lookup_coordinator = False
retriable_failure = False
errors = {TopicPartition(topic, partition): Errors.for_code(error_code)
for topic, partition_data in response.topics
for partition, error_code in partition_data}
for tp, error in six.iteritems(errors):
if error is Errors.NoError:
log.debug("Successfully added offsets for %s from consumer group %s to transaction.",
tp, self.consumer_group_id)
del self.transaction_manager._pending_txn_offset_commits[tp]
            elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError, Errors.RequestTimedOutError):
retriable_failure = True
lookup_coordinator = True
elif error is Errors.UnknownTopicOrPartitionError:
retriable_failure = True
elif error is Errors.GroupAuthorizationFailedError:
self.abortable_error(error(self.consumer_group_id))
return
elif error in (Errors.TransactionalIdAuthorizationFailedError,
Errors.InvalidProducerEpochError,
Errors.UnsupportedForMessageFormatError):
self.fatal_error(error())
return
else:
self.fatal_error(Errors.KafkaError("Unexpected error in TxnOffsetCommitResponse: %s" % (error())))
return
if lookup_coordinator:
self.transaction_manager._lookup_coordinator('group', self.consumer_group_id)
if not retriable_failure:
# all attempted partitions were either successful, or there was a fatal failure.
# either way, we are not retrying, so complete the request.
self.result.done()
# retry the commits which failed with a retriable error.
elif self.transaction_manager._pending_txn_offset_commits:
self.offsets = self.transaction_manager._pending_txn_offset_commits
self.request = self._build_request()
self.reenqueue()