API refactor

2025-10-07 16:25:52 +09:00
parent 76d0d86211
commit 91c7e04474
1171 changed files with 81940 additions and 44117 deletions

kafka/producer/buffer.py

@@ -1,115 +0,0 @@
from __future__ import absolute_import, division
import collections
import io
import threading
import time
from kafka.metrics.stats import Rate
import kafka.errors as Errors
class SimpleBufferPool(object):
"""A simple pool of BytesIO objects with a weak memory ceiling."""
def __init__(self, memory, poolable_size, metrics=None, metric_group_prefix='producer-metrics'):
"""Create a new buffer pool.
Arguments:
memory (int): maximum memory that this buffer pool can allocate
poolable_size (int): memory size per buffer to cache in the free
list rather than deallocating
"""
self._poolable_size = poolable_size
self._lock = threading.RLock()
buffers = int(memory / poolable_size) if poolable_size else 0
self._free = collections.deque([io.BytesIO() for _ in range(buffers)])
self._waiters = collections.deque()
self.wait_time = None
if metrics:
self.wait_time = metrics.sensor('bufferpool-wait-time')
self.wait_time.add(metrics.metric_name(
'bufferpool-wait-ratio', metric_group_prefix,
'The fraction of time an appender waits for space allocation.'),
Rate())
def allocate(self, size, max_time_to_block_ms):
"""
Allocate a buffer of the given size. This method blocks if there is not
enough memory and the buffer pool is configured with blocking mode.
Arguments:
size (int): The buffer size to allocate in bytes [ignored]
max_time_to_block_ms (int): The maximum time in milliseconds to
block for buffer memory to be available
Returns:
io.BytesIO
"""
with self._lock:
# check if we have a free buffer of the right size pooled
if self._free:
return self._free.popleft()
elif self._poolable_size == 0:
return io.BytesIO()
else:
# we are out of buffers and will have to block
buf = None
more_memory = threading.Condition(self._lock)
self._waiters.append(more_memory)
# loop over and over until we have a buffer or have reserved
# enough memory to allocate one
while buf is None:
start_wait = time.time()
more_memory.wait(max_time_to_block_ms / 1000.0)
end_wait = time.time()
if self.wait_time:
self.wait_time.record(end_wait - start_wait)
if self._free:
buf = self._free.popleft()
else:
self._waiters.remove(more_memory)
raise Errors.KafkaTimeoutError(
"Failed to allocate memory within the configured"
" max blocking time")
# remove the condition for this thread to let the next thread
# in line start getting memory
removed = self._waiters.popleft()
assert removed is more_memory, 'Wrong condition'
# signal any additional waiters if there is more memory left
# over for them
if self._free and self._waiters:
self._waiters[0].notify()
# unlock and return the buffer
return buf
def deallocate(self, buf):
"""
Return buffers to the pool. If they are of the poolable size add them
to the free list, otherwise just mark the memory as free.
Arguments:
buffer_ (io.BytesIO): The buffer to return
"""
with self._lock:
# BytesIO.truncate here makes the pool somewhat pointless
# but we stick with the BufferPool API until migrating to
# bytesarray / memoryview. The buffer we return must not
# expose any prior data on read().
buf.truncate(0)
self._free.append(buf)
if self._waiters:
self._waiters[0].notify()
def queued(self):
"""The number of threads blocked waiting on memory."""
with self._lock:
return len(self._waiters)
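
A minimal usage sketch of the pool above (illustrative buffer sizes; note that allocate() ignores its size argument, per the docstring):

    # Pool with room for four poolable buffers of 16 KiB each.
    pool = SimpleBufferPool(memory=4 * 16384, poolable_size=16384)

    buf = pool.allocate(size=16384, max_time_to_block_ms=1000)
    buf.write(b'record bytes')
    # ... use the buffer for a batch, then return it to the pool:
    pool.deallocate(buf)  # truncates to 0 bytes and wakes one waiting thread

    # Once all pooled buffers are checked out, the next allocate() blocks
    # for up to max_time_to_block_ms and then raises KafkaTimeoutError.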

kafka/producer/future.py

@@ -38,7 +38,7 @@ class FutureRecordMetadata(Future):
produce_future.add_errback(self.failure)
def _produce_success(self, offset_and_timestamp):
offset, produce_timestamp_ms, log_start_offset = offset_and_timestamp
offset, produce_timestamp_ms = offset_and_timestamp
# Unpacking from args tuple is minor speed optimization
(relative_offset, timestamp_ms, checksum,
@@ -51,7 +51,7 @@ class FutureRecordMetadata(Future):
if offset != -1 and relative_offset is not None:
offset += relative_offset
tp = self._produce_future.topic_partition
metadata = RecordMetadata(tp[0], tp[1], tp, offset, timestamp_ms, log_start_offset,
metadata = RecordMetadata(tp[0], tp[1], tp, offset, timestamp_ms,
checksum, serialized_key_size,
serialized_value_size, serialized_header_size)
self.success(metadata)
@@ -67,5 +67,5 @@ class FutureRecordMetadata(Future):
RecordMetadata = collections.namedtuple(
'RecordMetadata', ['topic', 'partition', 'topic_partition', 'offset', 'timestamp', 'log_start_offset',
'RecordMetadata', ['topic', 'partition', 'topic_partition', 'offset', 'timestamp',
'checksum', 'serialized_key_size', 'serialized_value_size', 'serialized_header_size'])

kafka/producer/kafka.py

@@ -1,11 +1,11 @@
from __future__ import absolute_import
from __future__ import absolute_import, division
import atexit
import copy
import logging
import socket
import threading
import time
import warnings
import weakref
from kafka.vendor import six
@@ -18,10 +18,12 @@ from kafka.partitioner.default import DefaultPartitioner
from kafka.producer.future import FutureRecordMetadata, FutureProduceResult
from kafka.producer.record_accumulator import AtomicInteger, RecordAccumulator
from kafka.producer.sender import Sender
from kafka.producer.transaction_manager import TransactionManager
from kafka.record.default_records import DefaultRecordBatchBuilder
from kafka.record.legacy_records import LegacyRecordBatchBuilder
from kafka.serializer import Serializer
from kafka.structs import TopicPartition
from kafka.util import Timer, ensure_valid_topic_name
log = logging.getLogger(__name__)
@@ -34,8 +36,8 @@ class KafkaProducer(object):
The producer is thread safe and sharing a single producer instance across
threads will generally be faster than having multiple instances.
The producer consists of a pool of buffer space that holds records that
haven't yet been transmitted to the server as well as a background I/O
The producer consists of a RecordAccumulator which holds records that
haven't yet been transmitted to the server, and a Sender background I/O
thread that is responsible for turning these records into requests and
transmitting them to the cluster.
@@ -71,14 +73,50 @@ class KafkaProducer(object):
can lead to fewer, more efficient requests when not under maximal load at
the cost of a small amount of latency.
The buffer_memory controls the total amount of memory available to the
producer for buffering. If records are sent faster than they can be
transmitted to the server then this buffer space will be exhausted. When
the buffer space is exhausted additional send calls will block.
The key_serializer and value_serializer instruct how to turn the key and
value objects the user provides into bytes.
From Kafka 0.11, the KafkaProducer supports two additional modes:
the idempotent producer and the transactional producer.
The idempotent producer strengthens Kafka's delivery semantics from
at least once to exactly once delivery. In particular, producer retries
will no longer introduce duplicates. The transactional producer allows an
application to send messages to multiple partitions (and topics!)
atomically.
To enable idempotence, the `enable_idempotence` configuration must be set
to True. If set, the `retries` config will default to `float('inf')` and
the `acks` config will default to 'all'. There are no API changes for the
idempotent producer, so existing applications will not need to be modified
to take advantage of this feature.
To take advantage of the idempotent producer, it is imperative to avoid
application level re-sends since these cannot be de-duplicated. As such, if
an application enables idempotence, it is recommended to leave the
`retries` config unset, as it will be defaulted to `float('inf')`.
Additionally, if a :meth:`~kafka.KafkaProducer.send` returns an error even
with infinite retries (for instance if the message expires in the buffer
before being sent), then it is recommended to shut down the producer and
check the contents of the last produced message to ensure that it is not
duplicated. Finally, the producer can only guarantee idempotence for
messages sent within a single session.
To use the transactional producer and the attendant APIs, you must set the
`transactional_id` configuration property. If the `transactional_id` is
set, idempotence is automatically enabled along with the producer configs
which idempotence depends on. Further, topics which are included in
transactions should be configured for durability. In particular, the
`replication.factor` should be at least `3`, and the `min.insync.replicas`
for these topics should be set to 2. Finally, in order for transactional
guarantees to be realized from end-to-end, the consumers must be
configured to read only committed messages as well.
The purpose of the `transactional_id` is to enable transaction recovery
across multiple sessions of a single producer instance. It would typically
be derived from the shard identifier in a partitioned, stateful,
application. As such, it should be unique to each producer instance running
within a partitioned application.
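
A minimal sketch of the transactional flow described above, assuming a broker >= 0.11 and the transaction methods introduced in this commit (hostnames and ids are illustrative):

    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',
        transactional_id='my-app-shard-0',  # implicitly enables idempotence
    )
    producer.init_transactions()  # call exactly once, before any transaction
    try:
        producer.begin_transaction()
        producer.send('topic-a', b'msg1')
        producer.send('topic-b', b'msg2')
        producer.commit_transaction()
    except Exception:
        producer.abort_transaction()
        raise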
Keyword Arguments:
bootstrap_servers: 'host[:port]' string (or list of 'host[:port]'
strings) that the producer should contact to bootstrap initial
@@ -96,6 +134,28 @@ class KafkaProducer(object):
value_serializer (callable): used to convert user-supplied message
values to bytes. If not None, called as f(value), should return
bytes. Default: None.
enable_idempotence (bool): When set to True, the producer will ensure
that exactly one copy of each message is written in the stream.
If False, producer retries due to broker failures, etc., may write
duplicates of the retried message in the stream. Default: False.
Note that enabling idempotence requires
`max_in_flight_requests_per_connection` to be set to 1 and `retries`
cannot be zero. Additionally, `acks` must be set to 'all'. If these
values are left at their defaults, the producer will override the
defaults to be suitable. If the values are set to something
incompatible with the idempotent producer, a KafkaConfigurationError
will be raised.
delivery_timeout_ms (float): An upper bound on the time to report success
or failure after producer.send() returns. This limits the total time
that a record will be delayed prior to sending, the time to await
acknowledgement from the broker (if expected), and the time allowed
for retriable send failures. The producer may report failure to send
a record earlier than this config if either an unrecoverable error is
encountered, the retries have been exhausted, or the record is added
to a batch which reached an earlier delivery expiration deadline.
The value of this config should be greater than or equal to the
sum of (request_timeout_ms + linger_ms). Default: 120000.
acks (0, 1, 'all'): The number of acknowledgments the producer requires
the leader to have received before considering a request complete.
This controls the durability of records that are sent. The
@@ -123,7 +183,7 @@ class KafkaProducer(object):
Compression is of full batches of data, so the efficacy of batching
will also impact the compression ratio (more batching means better
compression). Default: None.
retries (int): Setting a value greater than zero will cause the client
retries (numeric): Setting a value greater than zero will cause the client
to resend any record whose send fails with a potentially transient
error. Note that this retry is no different than if the client
resent the record upon receiving the error. Allowing retries
@@ -131,8 +191,12 @@ class KafkaProducer(object):
potentially change the ordering of records because if two batches
are sent to a single partition, and the first fails and is retried
but the second succeeds, then the records in the second batch may
appear first.
Default: 0.
appear first. Note additionally that produce requests will be
failed before the number of retries has been exhausted if the timeout
configured by delivery_timeout_ms expires first before successful
acknowledgement. Users should generally prefer to leave this config
unset and instead use delivery_timeout_ms to control retry behavior.
Default: float('inf') (infinite)
batch_size (int): Requests sent to brokers will contain multiple
batches, one for each partition with data available to be sent.
A small batch size will make batching less common and may reduce
@@ -165,12 +229,6 @@ class KafkaProducer(object):
messages with the same key are assigned to the same partition.
When a key is None, the message is delivered to a random partition
(filtered to partitions with available leaders only, if possible).
buffer_memory (int): The total bytes of memory the producer should use
to buffer records waiting to be sent to the server. If records are
sent faster than they can be delivered to the server the producer
will block up to max_block_ms, raising an exception on timeout.
In the current implementation, this setting is an approximation.
Default: 33554432 (32MB)
connections_max_idle_ms: Close idle connections after the number of
milliseconds specified by this config. The broker closes idle
connections after connections.max.idle.ms, so this avoids hitting
@@ -188,6 +246,9 @@ class KafkaProducer(object):
This setting will limit the number of record batches the producer
will send in a single request to avoid sending huge requests.
Default: 1048576.
allow_auto_create_topics (bool): Enable/disable auto topic creation
on metadata request. Only available with api_version >= (0, 11).
Default: True
metadata_max_age_ms (int): The period of time in milliseconds after
which we force a refresh of metadata even if we haven't seen any
partition leadership changes to proactively discover any new
@@ -216,7 +277,7 @@ class KafkaProducer(object):
reconnection attempts will continue periodically with this fixed
rate. To avoid connection storms, a randomization factor of 0.2
will be applied to the backoff resulting in a random range between
20% below and 20% above the computed value. Default: 1000.
20% below and 20% above the computed value. Default: 30000.
max_in_flight_requests_per_connection (int): Requests are pipelined
to kafka brokers up to this number of maximum requests per
broker connection. Note that if this setting is set to be greater
@@ -233,7 +294,7 @@ class KafkaProducer(object):
should verify that the certificate matches the brokers hostname.
default: true.
ssl_cafile (str): optional filename of ca file to use in certificate
veriication. default: none.
verification. default: none.
ssl_certfile (str): optional filename of file in pem format containing
the client certificate, as well as any ca certificates needed to
establish the certificate's authenticity. default: none.
@@ -252,14 +313,28 @@ class KafkaProducer(object):
or other configuration forbids use of all the specified ciphers),
an ssl.SSLError will be raised. See ssl.SSLContext.set_ciphers
api_version (tuple): Specify which Kafka API version to use. If set to
None, the client will attempt to infer the broker version by probing
various APIs. Example: (0, 10, 2). Default: None
None, the client will attempt to determine the broker version via
ApiVersionsRequest API or, for brokers earlier than 0.10, probing
various known APIs. Dynamic version checking is performed eagerly
during __init__ and can raise NoBrokersAvailableError if no connection
was made before timeout (see api_version_auto_timeout_ms below).
Different versions enable different functionality.
Examples:
(3, 9) most recent broker release, enable all supported features
(0, 11) enables message format v2 (internal)
(0, 10, 0) enables sasl authentication and message format v1
(0, 8, 0) enables basic functionality only
Default: None
api_version_auto_timeout_ms (int): number of milliseconds to throw a
timeout exception from the constructor when checking the broker
api version. Only applies if api_version set to None.
Default: 2000
metric_reporters (list): A list of classes to use as metrics reporters.
Implementing the AbstractMetricsReporter interface allows plugging
in classes that will be notified of new metric creation. Default: []
metrics_enabled (bool): Whether to track metrics on this instance. Default True.
metrics_num_samples (int): The number of samples maintained to compute
metrics. Default: 2
metrics_sample_window_ms (int): The maximum age in milliseconds of
@@ -274,33 +349,42 @@ class KafkaProducer(object):
Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication.
Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
sasl_kerberos_name (str or gssapi.Name): Constructed gssapi.Name for use with
sasl mechanism handshake. If provided, sasl_kerberos_service_name and
sasl_kerberos_domain_name are ignored. Default: None.
sasl_kerberos_service_name (str): Service name to include in GSSAPI
sasl mechanism handshake. Default: 'kafka'
sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI
sasl mechanism handshake. Default: one of bootstrap servers
sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider
instance. (See kafka.oauth.abstract). Default: None
sasl_oauth_token_provider (kafka.sasl.oauth.AbstractTokenProvider): OAuthBearer
token provider instance. Default: None
socks5_proxy (str): Socks5 proxy URL. Default: None
kafka_client (callable): Custom class / callable for creating KafkaClient instances
Note:
Configuration parameters are described in more detail at
https://kafka.apache.org/0100/configuration.html#producerconfigs
https://kafka.apache.org/0100/documentation/#producerconfigs
"""
DEFAULT_CONFIG = {
'bootstrap_servers': 'localhost',
'client_id': None,
'key_serializer': None,
'value_serializer': None,
'enable_idempotence': False,
'transactional_id': None,
'transaction_timeout_ms': 60000,
'delivery_timeout_ms': 120000,
'acks': 1,
'bootstrap_topics_filter': set(),
'compression_type': None,
'retries': 0,
'retries': float('inf'),
'batch_size': 16384,
'linger_ms': 0,
'partitioner': DefaultPartitioner(),
'buffer_memory': 33554432,
'connections_max_idle_ms': 9 * 60 * 1000,
'max_block_ms': 60000,
'max_request_size': 1048576,
'allow_auto_create_topics': True,
'metadata_max_age_ms': 300000,
'retry_backoff_ms': 100,
'request_timeout_ms': 30000,
@@ -310,7 +394,7 @@ class KafkaProducer(object):
'sock_chunk_bytes': 4096, # undocumented experimental option
'sock_chunk_buffer_count': 1000, # undocumented experimental option
'reconnect_backoff_ms': 50,
'reconnect_backoff_max_ms': 1000,
'reconnect_backoff_max_ms': 30000,
'max_in_flight_requests_per_connection': 5,
'security_protocol': 'PLAINTEXT',
'ssl_context': None,
@@ -324,17 +408,23 @@ class KafkaProducer(object):
'api_version': None,
'api_version_auto_timeout_ms': 2000,
'metric_reporters': [],
'metrics_enabled': True,
'metrics_num_samples': 2,
'metrics_sample_window_ms': 30000,
'selector': selectors.DefaultSelector,
'sasl_mechanism': None,
'sasl_plain_username': None,
'sasl_plain_password': None,
'sasl_kerberos_name': None,
'sasl_kerberos_service_name': 'kafka',
'sasl_kerberos_domain_name': None,
'sasl_oauth_token_provider': None
'sasl_oauth_token_provider': None,
'socks5_proxy': None,
'kafka_client': KafkaClient,
}
DEPRECATED_CONFIGS = ('buffer_memory',)
_COMPRESSORS = {
'gzip': (has_gzip, LegacyRecordBatchBuilder.CODEC_GZIP),
'snappy': (has_snappy, LegacyRecordBatchBuilder.CODEC_SNAPPY),
@@ -344,12 +434,17 @@ class KafkaProducer(object):
}
def __init__(self, **configs):
log.debug("Starting the Kafka producer") # trace
self.config = copy.copy(self.DEFAULT_CONFIG)
user_provided_configs = set(configs.keys())
for key in self.config:
if key in configs:
self.config[key] = configs.pop(key)
for key in self.DEPRECATED_CONFIGS:
if key in configs:
configs.pop(key)
warnings.warn('Deprecated Producer config: %s' % (key,), DeprecationWarning)
# Only check for extra config keys in top-level class
assert not configs, 'Unrecognized configs: %s' % (configs,)
@@ -367,30 +462,35 @@ class KafkaProducer(object):
self.config['api_version'] = None
else:
self.config['api_version'] = tuple(map(int, deprecated.split('.')))
log.warning('use api_version=%s [tuple] -- "%s" as str is deprecated',
str(self.config['api_version']), deprecated)
log.warning('%s: use api_version=%s [tuple] -- "%s" as str is deprecated',
str(self), str(self.config['api_version']), deprecated)
log.debug("%s: Starting Kafka producer", str(self))
# Configure metrics
metrics_tags = {'client-id': self.config['client_id']}
metric_config = MetricConfig(samples=self.config['metrics_num_samples'],
time_window_ms=self.config['metrics_sample_window_ms'],
tags=metrics_tags)
reporters = [reporter() for reporter in self.config['metric_reporters']]
self._metrics = Metrics(metric_config, reporters)
if self.config['metrics_enabled']:
metrics_tags = {'client-id': self.config['client_id']}
metric_config = MetricConfig(samples=self.config['metrics_num_samples'],
time_window_ms=self.config['metrics_sample_window_ms'],
tags=metrics_tags)
reporters = [reporter() for reporter in self.config['metric_reporters']]
self._metrics = Metrics(metric_config, reporters)
else:
self._metrics = None
client = KafkaClient(metrics=self._metrics, metric_group_prefix='producer',
wakeup_timeout_ms=self.config['max_block_ms'],
**self.config)
client = self.config['kafka_client'](
metrics=self._metrics, metric_group_prefix='producer',
wakeup_timeout_ms=self.config['max_block_ms'],
**self.config)
# Get auto-discovered version from client if necessary
if self.config['api_version'] is None:
self.config['api_version'] = client.config['api_version']
# Get auto-discovered / normalized version from client
self.config['api_version'] = client.config['api_version']
if self.config['compression_type'] == 'lz4':
assert self.config['api_version'] >= (0, 8, 2), 'LZ4 Requires >= Kafka 0.8.2 Brokers'
if self.config['compression_type'] == 'zstd':
assert self.config['api_version'] >= (2, 1, 0), 'Zstd Requires >= Kafka 2.1.0 Brokers'
assert self.config['api_version'] >= (2, 1), 'Zstd Requires >= Kafka 2.1 Brokers'
# Check compression_type for library support
ct = self.config['compression_type']
@@ -401,12 +501,58 @@ class KafkaProducer(object):
assert checker(), "Libraries for {} compression codec not found".format(ct)
self.config['compression_attrs'] = compression_attrs
message_version = self._max_usable_produce_magic()
self._accumulator = RecordAccumulator(message_version=message_version, metrics=self._metrics, **self.config)
self._metadata = client.cluster
self._transaction_manager = None
self._init_transactions_result = None
if 'enable_idempotence' in user_provided_configs and not self.config['enable_idempotence'] and self.config['transactional_id']:
raise Errors.KafkaConfigurationError("Cannot set transactional_id without enable_idempotence.")
if self.config['transactional_id']:
self.config['enable_idempotence'] = True
if self.config['enable_idempotence']:
assert self.config['api_version'] >= (0, 11), "Transactional/Idempotent producer requires >= Kafka 0.11 Brokers"
self._transaction_manager = TransactionManager(
transactional_id=self.config['transactional_id'],
transaction_timeout_ms=self.config['transaction_timeout_ms'],
retry_backoff_ms=self.config['retry_backoff_ms'],
api_version=self.config['api_version'],
metadata=self._metadata,
)
if self._transaction_manager.is_transactional():
log.info("%s: Instantiated a transactional producer.", str(self))
else:
log.info("%s: Instantiated an idempotent producer.", str(self))
if self.config['retries'] == 0:
raise Errors.KafkaConfigurationError("Must set 'retries' to non-zero when using the idempotent producer.")
if 'max_in_flight_requests_per_connection' not in user_provided_configs:
log.info("%s: Overriding the default 'max_in_flight_requests_per_connection' to 1 since idempontence is enabled.", str(self))
self.config['max_in_flight_requests_per_connection'] = 1
elif self.config['max_in_flight_requests_per_connection'] != 1:
raise Errors.KafkaConfigurationError("Must set 'max_in_flight_requests_per_connection' to 1 in order"
" to use the idempotent producer."
" Otherwise we cannot guarantee idempotence.")
if 'acks' not in user_provided_configs:
log.info("%s: Overriding the default 'acks' config to 'all' since idempotence is enabled", str(self))
self.config['acks'] = -1
elif self.config['acks'] != -1:
raise Errors.KafkaConfigurationError("Must set 'acks' config to 'all' in order to use the idempotent"
" producer. Otherwise we cannot guarantee idempotence")
message_version = self.max_usable_produce_magic(self.config['api_version'])
self._accumulator = RecordAccumulator(
transaction_manager=self._transaction_manager,
message_version=message_version,
**self.config)
guarantee_message_order = bool(self.config['max_in_flight_requests_per_connection'] == 1)
self._sender = Sender(client, self._metadata,
self._accumulator, self._metrics,
self._accumulator,
metrics=self._metrics,
transaction_manager=self._transaction_manager,
guarantee_message_order=guarantee_message_order,
**self.config)
self._sender.daemon = True
@@ -415,7 +561,7 @@ class KafkaProducer(object):
self._cleanup = self._cleanup_factory()
atexit.register(self._cleanup)
log.debug("Kafka producer started")
log.debug("%s: Kafka producer started", str(self))
def bootstrap_connected(self):
"""Return True if the bootstrap is connected."""
@@ -426,7 +572,7 @@ class KafkaProducer(object):
_self = weakref.proxy(self)
def wrapper():
try:
_self.close(timeout=0)
_self.close(timeout=0, null_logger=True)
except (ReferenceError, AttributeError):
pass
return wrapper
@@ -449,28 +595,28 @@ class KafkaProducer(object):
self._cleanup = None
def __del__(self):
# Disable logger during destruction to avoid touching dangling references
class NullLogger(object):
def __getattr__(self, name):
return lambda *args: None
self.close(timeout=1, null_logger=True)
global log
log = NullLogger()
self.close()
def close(self, timeout=None):
def close(self, timeout=None, null_logger=False):
"""Close this producer.
Arguments:
timeout (float, optional): timeout in seconds to wait for completion.
"""
if null_logger:
# Disable logger during destruction to avoid touching dangling references
class NullLogger(object):
def __getattr__(self, name):
return lambda *args: None
global log
log = NullLogger()
# drop our atexit handler now to avoid leaks
self._unregister_cleanup()
if not hasattr(self, '_closed') or self._closed:
log.info('Kafka producer closed')
log.info('%s: Kafka producer closed', str(self))
return
if timeout is None:
# threading.TIMEOUT_MAX is available in Python3.3+
@@ -480,15 +626,16 @@ class KafkaProducer(object):
else:
assert timeout >= 0
log.info("Closing the Kafka producer with %s secs timeout.", timeout)
log.info("%s: Closing the Kafka producer with %s secs timeout.", str(self), timeout)
self.flush(timeout)
invoked_from_callback = bool(threading.current_thread() is self._sender)
if timeout > 0:
if invoked_from_callback:
log.warning("Overriding close timeout %s secs to 0 in order to"
log.warning("%s: Overriding close timeout %s secs to 0 in order to"
" prevent useless blocking due to self-join. This"
" means you have incorrectly invoked close with a"
" non-zero timeout from the producer call-back.",
timeout)
str(self), timeout)
else:
# Try to close gracefully.
if self._sender is not None:
@@ -496,12 +643,13 @@ class KafkaProducer(object):
self._sender.join(timeout)
if self._sender is not None and self._sender.is_alive():
log.info("Proceeding to force close the producer since pending"
log.info("%s: Proceeding to force close the producer since pending"
" requests could not be completed within timeout %s.",
timeout)
str(self), timeout)
self._sender.force_close()
self._metrics.close()
if self._metrics:
self._metrics.close()
try:
self.config['key_serializer'].close()
except AttributeError:
@@ -511,23 +659,23 @@ class KafkaProducer(object):
except AttributeError:
pass
self._closed = True
log.debug("The Kafka producer has closed.")
log.debug("%s: The Kafka producer has closed.", str(self))
def partitions_for(self, topic):
"""Returns set of all known partitions for the topic."""
max_wait = self.config['max_block_ms'] / 1000.0
return self._wait_on_metadata(topic, max_wait)
return self._wait_on_metadata(topic, self.config['max_block_ms'])
def _max_usable_produce_magic(self):
if self.config['api_version'] >= (0, 11):
@classmethod
def max_usable_produce_magic(cls, api_version):
if api_version >= (0, 11):
return 2
elif self.config['api_version'] >= (0, 10):
elif api_version >= (0, 10, 0):
return 1
else:
return 0
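
A quick illustration of the version-to-magic mapping implemented by the classmethod above:

    assert KafkaProducer.max_usable_produce_magic((3, 9)) == 2     # message format v2
    assert KafkaProducer.max_usable_produce_magic((0, 10, 1)) == 1 # v1, with timestamps
    assert KafkaProducer.max_usable_produce_magic((0, 9)) == 0     # legacy v0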
def _estimate_size_in_bytes(self, key, value, headers=[]):
magic = self._max_usable_produce_magic()
magic = self.max_usable_produce_magic(self.config['api_version'])
if magic == 2:
return DefaultRecordBatchBuilder.estimate_size_in_bytes(
key, value, headers)
@@ -535,6 +683,114 @@ class KafkaProducer(object):
return LegacyRecordBatchBuilder.estimate_size_in_bytes(
magic, self.config['compression_type'], key, value)
def init_transactions(self):
"""
Needs to be called before any other methods when the transactional.id is set in the configuration.
This method does the following:
1. Ensures any transactions initiated by previous instances of the producer with the same
transactional_id are completed. If the previous instance had failed with a transaction in
progress, it will be aborted. If the last transaction had begun completion,
but not yet finished, this method awaits its completion.
2. Gets the internal producer id and epoch, used in all future transactional
messages issued by the producer.
Note that this method will raise KafkaTimeoutError if the transactional state cannot
be initialized before expiration of `max_block_ms`.
Retrying after a KafkaTimeoutError will continue to wait for the prior request to succeed or fail.
Retrying after any other exception will start a new initialization attempt.
Retrying after a successful initialization will do nothing.
Raises:
IllegalStateError: if no transactional_id has been configured
AuthorizationError: fatal error indicating that the configured
transactional_id is not authorized.
KafkaError: if the producer has encountered a previous fatal error or for any other unexpected error
KafkaTimeoutError: if the time taken to initialize the transaction has surpassed `max.block.ms`.
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot call init_transactions without setting a transactional_id.")
if self._init_transactions_result is None:
self._init_transactions_result = self._transaction_manager.initialize_transactions()
self._sender.wakeup()
try:
if not self._init_transactions_result.wait(timeout_ms=self.config['max_block_ms']):
raise Errors.KafkaTimeoutError("Timeout expired while initializing transactional state in %s ms." % (self.config['max_block_ms'],))
finally:
if self._init_transactions_result.failed:
self._init_transactions_result = None
def begin_transaction(self):
""" Should be called before the start of each new transaction.
Note that prior to the first invocation of this method,
you must invoke `init_transactions()` exactly one time.
Raises:
ProducerFencedError: if another producer with the same
transactional_id is active.
"""
# Set the transactional bit in the producer.
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions")
self._transaction_manager.begin_transaction()
def send_offsets_to_transaction(self, offsets, consumer_group_id):
"""
Sends a list of consumed offsets to the consumer group coordinator, and also marks
those offsets as part of the current transaction. These offsets will be considered
consumed only if the transaction is committed successfully.
This method should be used when you need to batch consumed and produced messages
together, typically in a consume-transform-produce pattern.
Arguments:
offsets ({TopicPartition: OffsetAndMetadata}): map of topic-partition -> offsets to commit
as part of current transaction.
consumer_group_id (str): Name of consumer group for offsets commit.
Raises:
IllegalStateError: if no transactional_id, or transaction has not been started.
ProducerFencedError: fatal error indicating another producer with the same transactional_id is active.
UnsupportedVersionError: fatal error indicating the broker does not support transactions (i.e. if < 0.11).
UnsupportedForMessageFormatError: fatal error indicating the message format used for the offsets
topic on the broker does not support transactions.
AuthorizationError: fatal error indicating that the configured transactional_id is not authorized.
KafkaError: if the producer has encountered a previous fatal or abortable error, or for any
other unexpected error
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions")
result = self._transaction_manager.send_offsets_to_transaction(offsets, consumer_group_id)
self._sender.wakeup()
result.wait()
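
A hedged sketch of the consume-transform-produce pattern this method supports; transform() is hypothetical, and the exact OffsetAndMetadata fields vary across kafka-python versions (offset and metadata are assumed here):

    from kafka import KafkaConsumer, KafkaProducer, TopicPartition
    from kafka.structs import OffsetAndMetadata

    consumer = KafkaConsumer('input-topic', group_id='my-group',
                             enable_auto_commit=False)
    producer = KafkaProducer(transactional_id='my-app-shard-0')
    producer.init_transactions()

    for msg in consumer:
        producer.begin_transaction()
        producer.send('output-topic', transform(msg.value))  # transform() is hypothetical
        # Commit the consumed offset within the transaction rather than via the consumer.
        offsets = {TopicPartition(msg.topic, msg.partition):
                   OffsetAndMetadata(msg.offset + 1, None)}  # field layout assumed
        producer.send_offsets_to_transaction(offsets, 'my-group')
        producer.commit_transaction()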
def commit_transaction(self):
""" Commits the ongoing transaction.
Raises: ProducerFencedError if another producer with the same
transactional_id is active.
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot commit transaction since transactions are not enabled")
result = self._transaction_manager.begin_commit()
self._sender.wakeup()
result.wait()
def abort_transaction(self):
""" Aborts the ongoing transaction.
Raises: ProducerFencedError if another producer with the same
transactional_id is active.
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot abort transaction since transactions are not enabled.")
result = self._transaction_manager.begin_abort()
self._sender.wakeup()
result.wait()
def send(self, topic, value=None, key=None, headers=None, partition=None, timestamp_ms=None):
"""Publish a message to a topic.
@@ -567,44 +823,58 @@ class KafkaProducer(object):
Raises:
KafkaTimeoutError: if unable to fetch topic metadata, or unable
to obtain memory buffer prior to configured max_block_ms
TypeError: if topic is not a string
ValueError: if topic is invalid: must be chars (a-zA-Z0-9._-), and less than 250 length
AssertionError: if KafkaProducer is closed, or key and value are both None
"""
assert not self._closed, 'KafkaProducer already closed!'
assert value is not None or self.config['api_version'] >= (0, 8, 1), (
'Null messages require kafka >= 0.8.1')
assert not (value is None and key is None), 'Need at least one: key or value'
ensure_valid_topic_name(topic)
key_bytes = value_bytes = None
timer = Timer(self.config['max_block_ms'], "Failed to assign partition for message in max_block_ms.")
try:
self._wait_on_metadata(topic, self.config['max_block_ms'] / 1000.0)
assigned_partition = None
while assigned_partition is None and not timer.expired:
self._wait_on_metadata(topic, timer.timeout_ms)
key_bytes = self._serialize(
self.config['key_serializer'],
topic, key)
value_bytes = self._serialize(
self.config['value_serializer'],
topic, value)
assert type(key_bytes) in (bytes, bytearray, memoryview, type(None))
assert type(value_bytes) in (bytes, bytearray, memoryview, type(None))
key_bytes = self._serialize(
self.config['key_serializer'],
topic, key)
value_bytes = self._serialize(
self.config['value_serializer'],
topic, value)
assert type(key_bytes) in (bytes, bytearray, memoryview, type(None))
assert type(value_bytes) in (bytes, bytearray, memoryview, type(None))
partition = self._partition(topic, partition, key, value,
key_bytes, value_bytes)
assigned_partition = self._partition(topic, partition, key, value,
key_bytes, value_bytes)
if assigned_partition is None:
raise Errors.KafkaTimeoutError("Failed to assign partition for message after %s secs." % timer.elapsed_ms / 1000)
else:
partition = assigned_partition
if headers is None:
headers = []
assert type(headers) == list
assert all(type(item) == tuple and len(item) == 2 and type(item[0]) == str and type(item[1]) == bytes for item in headers)
assert isinstance(headers, list)
assert all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str) and isinstance(item[1], bytes) for item in headers)
message_size = self._estimate_size_in_bytes(key_bytes, value_bytes, headers)
self._ensure_valid_record_size(message_size)
tp = TopicPartition(topic, partition)
log.debug("Sending (key=%r value=%r headers=%r) to %s", key, value, headers, tp)
log.debug("%s: Sending (key=%r value=%r headers=%r) to %s", str(self), key, value, headers, tp)
if self._transaction_manager and self._transaction_manager.is_transactional():
self._transaction_manager.maybe_add_partition_to_transaction(tp)
result = self._accumulator.append(tp, timestamp_ms,
key_bytes, value_bytes, headers,
self.config['max_block_ms'],
estimated_size=message_size)
key_bytes, value_bytes, headers)
future, batch_is_full, new_batch_created = result
if batch_is_full or new_batch_created:
log.debug("Waking up the sender since %s is either full or"
" getting a new batch", tp)
log.debug("%s: Waking up the sender since %s is either full or"
" getting a new batch", str(self), tp)
self._sender.wakeup()
return future
@@ -612,7 +882,7 @@ class KafkaProducer(object):
# for API exceptions return them in the future,
# for other exceptions raise directly
except Errors.BrokerResponseError as e:
log.debug("Exception occurred during message send: %s", e)
log.error("%s: Exception occurred during message send: %s", str(self), e)
return FutureRecordMetadata(
FutureProduceResult(TopicPartition(topic, partition)),
-1, None, None,
@@ -643,7 +913,7 @@ class KafkaProducer(object):
KafkaTimeoutError: failure to flush buffered records within the
provided timeout
"""
log.debug("Flushing accumulated records in producer.") # trace
log.debug("%s: Flushing accumulated records in producer.", str(self))
self._accumulator.begin_flush()
self._sender.wakeup()
self._accumulator.await_flush_completion(timeout=timeout)
@@ -655,13 +925,8 @@ class KafkaProducer(object):
"The message is %d bytes when serialized which is larger than"
" the maximum request size you have configured with the"
" max_request_size configuration" % (size,))
if size > self.config['buffer_memory']:
raise Errors.MessageSizeTooLargeError(
"The message is %d bytes when serialized which is larger than"
" the total memory buffer you have configured with the"
" buffer_memory configuration." % (size,))
def _wait_on_metadata(self, topic, max_wait):
def _wait_on_metadata(self, topic, max_wait_ms):
"""
Wait for cluster metadata including partitions for the given topic to
be available.
@@ -679,32 +944,31 @@ class KafkaProducer(object):
"""
# add topic to metadata topic list if it is not there already.
self._sender.add_topic(topic)
begin = time.time()
elapsed = 0.0
timer = Timer(max_wait_ms, "Failed to update metadata after %.1f secs." % (max_wait_ms / 1000,))
metadata_event = None
while True:
partitions = self._metadata.partitions_for_topic(topic)
if partitions is not None:
return partitions
timer.maybe_raise()
if not metadata_event:
metadata_event = threading.Event()
log.debug("Requesting metadata update for topic %s", topic)
log.debug("%s: Requesting metadata update for topic %s", str(self), topic)
metadata_event.clear()
future = self._metadata.request_update()
future.add_both(lambda e, *args: e.set(), metadata_event)
self._sender.wakeup()
metadata_event.wait(max_wait - elapsed)
elapsed = time.time() - begin
if not metadata_event.is_set():
metadata_event.wait(timer.timeout_ms / 1000)
if not future.is_done:
raise Errors.KafkaTimeoutError(
"Failed to update metadata after %.1f secs." % (max_wait,))
"Failed to update metadata after %.1f secs." % (max_wait_ms / 1000,))
elif future.failed() and not future.retriable():
raise future.exception
elif topic in self._metadata.unauthorized_topics:
raise Errors.TopicAuthorizationFailedError(topic)
raise Errors.TopicAuthorizationFailedError(set([topic]))
else:
log.debug("_wait_on_metadata woke after %s secs.", elapsed)
log.debug("%s: _wait_on_metadata woke after %s secs.", str(self), timer.elapsed_ms / 1000)
def _serialize(self, f, topic, data):
if not f:
@@ -715,16 +979,18 @@ class KafkaProducer(object):
def _partition(self, topic, partition, key, value,
serialized_key, serialized_value):
all_partitions = self._metadata.partitions_for_topic(topic)
available = self._metadata.available_partitions_for_topic(topic)
if all_partitions is None or available is None:
return None
if partition is not None:
assert partition >= 0
assert partition in self._metadata.partitions_for_topic(topic), 'Unrecognized partition'
assert partition in all_partitions, 'Unrecognized partition'
return partition
all_partitions = sorted(self._metadata.partitions_for_topic(topic))
available = list(self._metadata.available_partitions_for_topic(topic))
return self.config['partitioner'](serialized_key,
all_partitions,
available)
sorted(all_partitions),
list(available))
def metrics(self, raw=False):
"""Get metrics on producer performance.
@@ -736,6 +1002,8 @@ class KafkaProducer(object):
This is an unstable interface. It may change in future
releases without warning.
"""
if not self._metrics:
return
if raw:
return self._metrics.metrics.copy()
@@ -747,3 +1015,6 @@ class KafkaProducer(object):
metrics[k.group][k.name] = {}
metrics[k.group][k.name] = v.value()
return metrics
def __str__(self):
return "<KafkaProducer client_id=%s transactional_id=%s>" % (self.config['client_id'], self.config['transactional_id'])

kafka/producer/record_accumulator.py

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from __future__ import absolute_import, division
import collections
import copy
@@ -6,8 +6,14 @@ import logging
import threading
import time
try:
# enum in stdlib as of py3.4
from enum import IntEnum # pylint: disable=import-error
except ImportError:
# vendored backport module
from kafka.vendor.enum34 import IntEnum
import kafka.errors as Errors
from kafka.producer.buffer import SimpleBufferPool
from kafka.producer.future import FutureRecordMetadata, FutureProduceResult
from kafka.record.memory_records import MemoryRecordsBuilder
from kafka.structs import TopicPartition
@@ -35,10 +41,16 @@ class AtomicInteger(object):
return self._val
class FinalState(IntEnum):
ABORTED = 0
FAILED = 1
SUCCEEDED = 2
class ProducerBatch(object):
def __init__(self, tp, records, buffer):
def __init__(self, tp, records, now=None):
now = time.time() if now is None else now
self.max_record_size = 0
now = time.time()
self.created = now
self.drained = None
self.attempts = 0
@@ -48,81 +60,120 @@ class ProducerBatch(object):
self.topic_partition = tp
self.produce_future = FutureProduceResult(tp)
self._retry = False
self._buffer = buffer # We only save it, we don't write to it
self._final_state = None
@property
def final_state(self):
return self._final_state
@property
def record_count(self):
return self.records.next_offset()
def try_append(self, timestamp_ms, key, value, headers):
@property
def producer_id(self):
return self.records.producer_id if self.records else None
@property
def producer_epoch(self):
return self.records.producer_epoch if self.records else None
@property
def has_sequence(self):
return self.records.has_sequence if self.records else False
def try_append(self, timestamp_ms, key, value, headers, now=None):
metadata = self.records.append(timestamp_ms, key, value, headers)
if metadata is None:
return None
now = time.time() if now is None else now
self.max_record_size = max(self.max_record_size, metadata.size)
self.last_append = time.time()
future = FutureRecordMetadata(self.produce_future, metadata.offset,
metadata.timestamp, metadata.crc,
len(key) if key is not None else -1,
len(value) if value is not None else -1,
sum(len(h_key.encode("utf-8")) + len(h_val) for h_key, h_val in headers) if headers else -1)
self.last_append = now
future = FutureRecordMetadata(
self.produce_future,
metadata.offset,
metadata.timestamp,
metadata.crc,
len(key) if key is not None else -1,
len(value) if value is not None else -1,
sum(len(h_key.encode("utf-8")) + len(h_val) for h_key, h_val in headers) if headers else -1)
return future
def done(self, base_offset=None, timestamp_ms=None, exception=None, log_start_offset=None, global_error=None):
level = logging.DEBUG if exception is None else logging.WARNING
log.log(level, "Produced messages to topic-partition %s with base offset"
" %s log start offset %s and error %s.", self.topic_partition, base_offset,
log_start_offset, global_error) # trace
def abort(self, exception):
"""Abort the batch and complete the future and callbacks."""
if self._final_state is not None:
raise Errors.IllegalStateError("Batch has already been completed in final state: %s" % self._final_state)
self._final_state = FinalState.ABORTED
log.debug("Aborting batch for partition %s: %s", self.topic_partition, exception)
self._complete_future(-1, -1, exception)
def done(self, base_offset=None, timestamp_ms=None, exception=None):
"""
Finalize the state of a batch. Final state, once set, is immutable. This function may be called
once or twice on a batch. It may be called twice if
1. An inflight batch expires before a response from the broker is received. The batch's final
state is set to FAILED. But it could succeed on the broker and second time around batch.done() may
try to set SUCCEEDED final state.
2. If the transaction is aborted or the producer is closed forcefully, the final state is
ABORTED but again it could succeed if broker responds with a success.
Attempted transitions from [FAILED | ABORTED] --> SUCCEEDED are logged.
Attempted transitions from one failure state to the same or a different failed state are ignored.
Attempted transitions from SUCCEEDED to the same or a failed state throw an exception.
"""
final_state = FinalState.SUCCEEDED if exception is None else FinalState.FAILED
if self._final_state is None:
self._final_state = final_state
if final_state is FinalState.SUCCEEDED:
log.debug("Successfully produced messages to %s with base offset %s", self.topic_partition, base_offset)
else:
log.warning("Failed to produce messages to topic-partition %s with base offset %s: %s",
self.topic_partition, base_offset, exception)
self._complete_future(base_offset, timestamp_ms, exception)
return True
elif self._final_state is not FinalState.SUCCEEDED:
if final_state is FinalState.SUCCEEDED:
# Log if a previously unsuccessful batch succeeded later on.
log.debug("ProduceResponse returned %s for %s after batch with base offset %s had already been %s.",
final_state, self.topic_partition, base_offset, self._final_state)
else:
# FAILED --> FAILED and ABORTED --> FAILED transitions are ignored.
log.debug("Ignored state transition %s -> %s for %s batch with base offset %s",
self._final_state, final_state, self.topic_partition, base_offset)
else:
# A SUCCESSFUL batch must not attempt another state change.
raise Errors.IllegalStateError("A %s batch must not attempt another state change to %s" % (self._final_state, final_state))
return False
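
A compact restatement of the transition rules above, assuming two fresh ProducerBatch instances named batch and other:

    batch.done(base_offset=42)                 # None -> SUCCEEDED, returns True
    batch.done(exception=Errors.KafkaError())  # SUCCEEDED -> FAILED raises IllegalStateError

    other.done(exception=Errors.KafkaError())  # None -> FAILED, returns True
    other.done(exception=Errors.KafkaError())  # FAILED -> FAILED ignored, returns False
    other.done(base_offset=42)                 # FAILED -> SUCCEEDED logged only, returns False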
def _complete_future(self, base_offset, timestamp_ms, exception):
if self.produce_future.is_done:
log.warning('Batch is already closed -- ignoring batch.done()')
return
raise Errors.IllegalStateError('Batch is already closed!')
elif exception is None:
self.produce_future.success((base_offset, timestamp_ms, log_start_offset))
self.produce_future.success((base_offset, timestamp_ms))
else:
self.produce_future.failure(exception)
def maybe_expire(self, request_timeout_ms, retry_backoff_ms, linger_ms, is_full):
"""Expire batches if metadata is not available
A batch whose metadata is not available should be expired if one
of the following is true:
* the batch is not in retry AND request timeout has elapsed after
it is ready (full or linger.ms has reached).
* the batch is in retry AND request timeout has elapsed after the
backoff period ended.
"""
now = time.time()
since_append = now - self.last_append
since_ready = now - (self.created + linger_ms / 1000.0)
since_backoff = now - (self.last_attempt + retry_backoff_ms / 1000.0)
timeout = request_timeout_ms / 1000.0
error = None
if not self.in_retry() and is_full and timeout < since_append:
error = "%d seconds have passed since last append" % (since_append,)
elif not self.in_retry() and timeout < since_ready:
error = "%d seconds have passed since batch creation plus linger time" % (since_ready,)
elif self.in_retry() and timeout < since_backoff:
error = "%d seconds have passed since last attempt plus backoff time" % (since_backoff,)
if error:
self.records.close()
self.done(-1, None, Errors.KafkaTimeoutError(
"Batch for %s containing %s record(s) expired: %s" % (
self.topic_partition, self.records.next_offset(), error)))
return True
return False
def has_reached_delivery_timeout(self, delivery_timeout_ms, now=None):
now = time.time() if now is None else now
return delivery_timeout_ms / 1000 <= now - self.created
def in_retry(self):
return self._retry
def set_retry(self):
def retry(self, now=None):
now = time.time() if now is None else now
self._retry = True
self.attempts += 1
self.last_attempt = now
self.last_append = now
def buffer(self):
return self._buffer
@property
def is_done(self):
return self.produce_future.is_done
def __str__(self):
return 'ProducerBatch(topic_partition=%s, record_count=%d)' % (
@@ -143,12 +194,6 @@ class RecordAccumulator(object):
A small batch size will make batching less common and may reduce
throughput (a batch size of zero will disable batching entirely).
Default: 16384
buffer_memory (int): The total bytes of memory the producer should use
to buffer records waiting to be sent to the server. If records are
sent faster than they can be delivered to the server the producer
will block up to max_block_ms, raising an exception on timeout.
In the current implementation, this setting is an approximation.
Default: 33554432 (32MB)
compression_attrs (int): The compression type for all data generated by
the producer. Valid values are gzip(1), snappy(2), lz4(3), or
none(0).
@@ -156,7 +201,7 @@ class RecordAccumulator(object):
will also impact the compression ratio (more batching means better
compression). Default: None.
linger_ms (int): An artificial delay time to add before declaring a
messageset (that isn't full) ready for sending. This allows
record batch (that isn't full) ready for sending. This allows
time for more records to arrive. Setting a non-zero linger_ms
will trade off some latency for potentially better throughput
due to more batching (and hence fewer, larger requests).
@@ -166,14 +211,14 @@ class RecordAccumulator(object):
all retries in a short period of time. Default: 100
"""
DEFAULT_CONFIG = {
'buffer_memory': 33554432,
'batch_size': 16384,
'compression_attrs': 0,
'linger_ms': 0,
'request_timeout_ms': 30000,
'delivery_timeout_ms': 120000,
'retry_backoff_ms': 100,
'message_version': 0,
'metrics': None,
'metric_group_prefix': 'producer-metrics',
'transaction_manager': None,
'message_version': 2,
}
def __init__(self, **configs):
@@ -183,22 +228,37 @@ class RecordAccumulator(object):
self.config[key] = configs.pop(key)
self._closed = False
self._transaction_manager = self.config['transaction_manager']
self._flushes_in_progress = AtomicInteger()
self._appends_in_progress = AtomicInteger()
self._batches = collections.defaultdict(collections.deque) # TopicPartition: [ProducerBatch]
self._tp_locks = {None: threading.Lock()} # TopicPartition: Lock, plus a lock to add entries
self._free = SimpleBufferPool(self.config['buffer_memory'],
self.config['batch_size'],
metrics=self.config['metrics'],
metric_group_prefix=self.config['metric_group_prefix'])
self._incomplete = IncompleteProducerBatches()
# The following variables should only be accessed by the sender thread,
# so we don't need to protect them w/ locking.
self.muted = set()
self._drain_index = 0
self._next_batch_expiry_time_ms = float('inf')
def append(self, tp, timestamp_ms, key, value, headers, max_time_to_block_ms,
estimated_size=0):
if self.config['delivery_timeout_ms'] < self.config['linger_ms'] + self.config['request_timeout_ms']:
raise Errors.KafkaConfigurationError("Must set delivery_timeout_ms higher than linger_ms + request_timeout_ms")
@property
def delivery_timeout_ms(self):
return self.config['delivery_timeout_ms']
@property
def next_expiry_time_ms(self):
return self._next_batch_expiry_time_ms
def _tp_lock(self, tp):
if tp not in self._tp_locks:
with self._tp_locks[None]:
if tp not in self._tp_locks:
self._tp_locks[tp] = threading.Lock()
return self._tp_locks[tp]
def append(self, tp, timestamp_ms, key, value, headers, now=None):
"""Add a record to the accumulator, return the append result.
The append result will contain the future metadata, and flag for
@@ -211,59 +271,53 @@ class RecordAccumulator(object):
key (bytes): The key for the record
value (bytes): The value for the record
headers (List[Tuple[str, bytes]]): The header fields for the record
max_time_to_block_ms (int): The maximum time in milliseconds to
block for buffer memory to be available
Returns:
tuple: (future, batch_is_full, new_batch_created)
"""
assert isinstance(tp, TopicPartition), 'not TopicPartition'
assert not self._closed, 'RecordAccumulator is closed'
now = time.time() if now is None else now
# We keep track of the number of appending thread to make sure we do
# not miss batches in abortIncompleteBatches().
self._appends_in_progress.increment()
try:
if tp not in self._tp_locks:
with self._tp_locks[None]:
if tp not in self._tp_locks:
self._tp_locks[tp] = threading.Lock()
with self._tp_locks[tp]:
with self._tp_lock(tp):
# check if we have an in-progress batch
dq = self._batches[tp]
if dq:
last = dq[-1]
future = last.try_append(timestamp_ms, key, value, headers)
future = last.try_append(timestamp_ms, key, value, headers, now=now)
if future is not None:
batch_is_full = len(dq) > 1 or last.records.is_full()
return future, batch_is_full, False
size = max(self.config['batch_size'], estimated_size)
log.debug("Allocating a new %d byte message buffer for %s", size, tp) # trace
buf = self._free.allocate(size, max_time_to_block_ms)
with self._tp_locks[tp]:
with self._tp_lock(tp):
# Need to check if producer is closed again after grabbing the
# dequeue lock.
assert not self._closed, 'RecordAccumulator is closed'
if dq:
last = dq[-1]
future = last.try_append(timestamp_ms, key, value, headers)
future = last.try_append(timestamp_ms, key, value, headers, now=now)
if future is not None:
# Somebody else found us a batch, return the one we
# waited for! Hopefully this doesn't happen often...
self._free.deallocate(buf)
batch_is_full = len(dq) > 1 or last.records.is_full()
return future, batch_is_full, False
if self._transaction_manager and self.config['message_version'] < 2:
raise Errors.UnsupportedVersionError("Attempting to use idempotence with a broker which"
" does not support the required message format (v2)."
" The broker must be version 0.11 or later.")
records = MemoryRecordsBuilder(
self.config['message_version'],
self.config['compression_attrs'],
self.config['batch_size']
)
batch = ProducerBatch(tp, records, buf)
future = batch.try_append(timestamp_ms, key, value, headers)
batch = ProducerBatch(tp, records, now=now)
future = batch.try_append(timestamp_ms, key, value, headers, now=now)
if not future:
raise Exception()
@@ -274,79 +328,43 @@ class RecordAccumulator(object):
finally:
self._appends_in_progress.decrement()
def abort_expired_batches(self, request_timeout_ms, cluster):
"""Abort the batches that have been sitting in RecordAccumulator for
more than the configured request_timeout due to metadata being
unavailable.
def reset_next_batch_expiry_time(self):
self._next_batch_expiry_time_ms = float('inf')
Arguments:
request_timeout_ms (int): milliseconds to timeout
cluster (ClusterMetadata): current metadata for kafka cluster
def maybe_update_next_batch_expiry_time(self, batch):
self._next_batch_expiry_time_ms = min(self._next_batch_expiry_time_ms, batch.created * 1000 + self.delivery_timeout_ms)
Returns:
list of ProducerBatch that were expired
"""
def expired_batches(self, now=None):
"""Get a list of batches which have been sitting in the accumulator too long and need to be expired."""
expired_batches = []
to_remove = []
count = 0
for tp in list(self._batches.keys()):
assert tp in self._tp_locks, 'TopicPartition not in locks dict'
# We only check if the batch should be expired if the partition
# does not have a batch in flight. This is to avoid the later
# batches get expired when an earlier batch is still in progress.
# This protection only takes effect when user sets
# max.in.flight.request.per.connection=1. Otherwise the expiration
order is not guaranteed.
if tp in self.muted:
continue
with self._tp_locks[tp]:
with self._tp_lock(tp):
# iterate over the batches and expire them if they have stayed
# in accumulator for more than request_timeout_ms
dq = self._batches[tp]
for batch in dq:
is_full = bool(bool(batch != dq[-1]) or batch.records.is_full())
# check if the batch is expired
if batch.maybe_expire(request_timeout_ms,
self.config['retry_backoff_ms'],
self.config['linger_ms'],
is_full):
while dq:
batch = dq[0]
if batch.has_reached_delivery_timeout(self.delivery_timeout_ms, now=now):
dq.popleft()
batch.records.close()
expired_batches.append(batch)
to_remove.append(batch)
count += 1
self.deallocate(batch)
else:
# Stop at the first batch that has not expired.
self.maybe_update_next_batch_expiry_time(batch)
break
# Python does not allow us to mutate the dq during iteration
# Assuming expired batches are infrequent, this is better than
# creating a new copy of the deque for iteration on every loop
if to_remove:
for batch in to_remove:
dq.remove(batch)
to_remove = []
if expired_batches:
log.warning("Expired %d batches in accumulator", count) # trace
return expired_batches
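# --- Editor's sketch (illustrative, not part of this commit) ---
# The expiry test above reduces to "now is past created + delivery timeout".
# A minimal standalone version, assuming `created` is a unix timestamp in
# seconds and `delivery_timeout_ms` is in milliseconds (names hypothetical):
import time

def has_reached_delivery_timeout(created, delivery_timeout_ms, now=None):
    now = time.time() if now is None else now
    return now - created >= delivery_timeout_ms / 1000

# a batch created 125s ago with a 120s delivery timeout is expired:
assert has_reached_delivery_timeout(time.time() - 125, 120000)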
def reenqueue(self, batch):
"""Re-enqueue the given record batch in the accumulator to retry."""
now = time.time()
batch.attempts += 1
batch.last_attempt = now
batch.last_append = now
batch.set_retry()
assert batch.topic_partition in self._tp_locks, 'TopicPartition not in locks dict'
assert batch.topic_partition in self._batches, 'TopicPartition not in batches'
dq = self._batches[batch.topic_partition]
with self._tp_locks[batch.topic_partition]:
def reenqueue(self, batch, now=None):
"""
Re-enqueue the given record batch in the accumulator. Sender._complete_batch already checks
whether the batch has reached delivery_timeout_ms, so we do not repeat the delivery timeout check here.
"""
batch.retry(now=now)
with self._tp_lock(batch.topic_partition):
dq = self._batches[batch.topic_partition]
dq.appendleft(batch)
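# --- Editor's sketch (illustrative, not part of this commit) ---
# appendleft puts the retried batch back at the head of the per-partition
# deque, so it is re-sent before any newer batches and per-partition
# ordering is preserved:
from collections import deque

dq = deque(['batch-2', 'batch-3'])
dq.appendleft('batch-1-retry')
assert list(dq) == ['batch-1-retry', 'batch-2', 'batch-3']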
def ready(self, cluster):
def ready(self, cluster, now=None):
"""
Get a list of nodes whose partitions are ready to be sent, and the
earliest time at which any non-sendable partition will be ready;
@@ -380,9 +398,8 @@ class RecordAccumulator(object):
ready_nodes = set()
next_ready_check = 9999999.99
unknown_leaders_exist = False
now = time.time()
now = time.time() if now is None else now
exhausted = bool(self._free.queued() > 0)
# several threads are accessing self._batches -- to simplify
# concurrent access, we iterate over a snapshot of partitions
# and lock each partition separately as needed
@@ -397,23 +414,23 @@ class RecordAccumulator(object):
elif tp in self.muted:
continue
with self._tp_locks[tp]:
with self._tp_lock(tp):
dq = self._batches[tp]
if not dq:
continue
batch = dq[0]
retry_backoff = self.config['retry_backoff_ms'] / 1000.0
linger = self.config['linger_ms'] / 1000.0
backing_off = bool(batch.attempts > 0 and
batch.last_attempt + retry_backoff > now)
retry_backoff = self.config['retry_backoff_ms'] / 1000
linger = self.config['linger_ms'] / 1000
backing_off = bool(batch.attempts > 0
and (batch.last_attempt + retry_backoff) > now)
waited_time = now - batch.last_attempt
time_to_wait = retry_backoff if backing_off else linger
time_left = max(time_to_wait - waited_time, 0)
full = bool(len(dq) > 1 or batch.records.is_full())
expired = bool(waited_time >= time_to_wait)
sendable = (full or expired or exhausted or self._closed or
self._flush_in_progress())
sendable = (full or expired or self._closed or
self.flush_in_progress())
if sendable and not backing_off:
ready_nodes.add(leader)
@@ -427,16 +444,98 @@ class RecordAccumulator(object):
return ready_nodes, next_ready_check, unknown_leaders_exist
def has_unsent(self):
"""Return whether there is any unsent record in the accumulator."""
def has_undrained(self):
"""Check whether there are any batches which haven't been drained"""
for tp in list(self._batches.keys()):
with self._tp_locks[tp]:
with self._tp_lock(tp):
dq = self._batches[tp]
if len(dq):
return True
return False
def drain(self, cluster, nodes, max_size):
def _should_stop_drain_batches_for_partition(self, first, tp):
if self._transaction_manager:
if not self._transaction_manager.is_send_to_partition_allowed(tp):
return True
if not self._transaction_manager.producer_id_and_epoch.is_valid:
# we cannot send the batch until we have refreshed the PID
log.debug("Waiting to send ready batches because transaction producer id is not valid")
return True
return False
def drain_batches_for_one_node(self, cluster, node_id, max_size, now=None):
now = time.time() if now is None else now
size = 0
ready = []
partitions = list(cluster.partitions_for_broker(node_id))
if not partitions:
return ready
# to make starvation less likely this loop doesn't start at 0
self._drain_index %= len(partitions)
start = None
while start != self._drain_index:
tp = partitions[self._drain_index]
if start is None:
start = self._drain_index
self._drain_index += 1
self._drain_index %= len(partitions)
# Only proceed if the partition has no in-flight batches.
if tp in self.muted:
continue
if tp not in self._batches:
continue
with self._tp_lock(tp):
dq = self._batches[tp]
if len(dq) == 0:
continue
first = dq[0]
backoff = bool(first.attempts > 0 and
first.last_attempt + self.config['retry_backoff_ms'] / 1000 > now)
# Only drain the batch if it is not during backoff
if backoff:
continue
if (size + first.records.size_in_bytes() > max_size
and len(ready) > 0):
# there is a rare case that a single batch
# size is larger than the request size due
# to compression; in this case we will
# still eventually send this batch in a
# single request
break
else:
if self._should_stop_drain_batches_for_partition(first, tp):
break
batch = dq.popleft()
if self._transaction_manager and not batch.in_retry():
# If the batch is in retry, then we should not change the pid and
# sequence number, since this may introduce duplicates. In particular,
# the previous attempt may actually have been accepted, and if we change
# the pid and sequence here, this attempt will also be accepted, causing
# a duplicate.
sequence_number = self._transaction_manager.sequence_number(batch.topic_partition)
log.debug("Dest: %s: %s producer_id=%s epoch=%s sequence=%s",
node_id, batch.topic_partition,
self._transaction_manager.producer_id_and_epoch.producer_id,
self._transaction_manager.producer_id_and_epoch.epoch,
sequence_number)
batch.records.set_producer_state(
self._transaction_manager.producer_id_and_epoch.producer_id,
self._transaction_manager.producer_id_and_epoch.epoch,
sequence_number,
self._transaction_manager.is_transactional()
)
batch.records.close()
size += batch.records.size_in_bytes()
ready.append(batch)
batch.drained = now
return ready
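# --- Editor's sketch (illustrative, not part of this commit) ---
# The loop above resumes from a persistent index instead of 0 so that the
# first partitions in the list cannot starve the later ones across calls.
# A toy version of the same rotation:
def round_robin(items, start_index):
    """Yield each item exactly once, starting at start_index and wrapping."""
    n = len(items)
    for offset in range(n):
        yield items[(start_index + offset) % n]

assert list(round_robin(['p0', 'p1', 'p2'], 1)) == ['p1', 'p2', 'p0']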
def drain(self, cluster, nodes, max_size, now=None):
"""
Drain all the data for the given nodes and collate them into a list of
batches that will fit within the specified size on a per-node basis.
@@ -454,59 +553,17 @@ class RecordAccumulator(object):
if not nodes:
return {}
now = time.time()
now = time.time() if now is None else now
batches = {}
for node_id in nodes:
size = 0
partitions = list(cluster.partitions_for_broker(node_id))
ready = []
# to make starvation less likely this loop doesn't start at 0
self._drain_index %= len(partitions)
start = self._drain_index
while True:
tp = partitions[self._drain_index]
if tp in self._batches and tp not in self.muted:
with self._tp_locks[tp]:
dq = self._batches[tp]
if dq:
first = dq[0]
backoff = (
bool(first.attempts > 0) and
bool(first.last_attempt +
self.config['retry_backoff_ms'] / 1000.0
> now)
)
# Only drain the batch if it is not during backoff
if not backoff:
if (size + first.records.size_in_bytes() > max_size
and len(ready) > 0):
# there is a rare case that a single batch
# size is larger than the request size due
# to compression; in this case we will
# still eventually send this batch in a
# single request
break
else:
batch = dq.popleft()
batch.records.close()
size += batch.records.size_in_bytes()
ready.append(batch)
batch.drained = now
self._drain_index += 1
self._drain_index %= len(partitions)
if start == self._drain_index:
break
batches[node_id] = ready
batches[node_id] = self.drain_batches_for_one_node(cluster, node_id, max_size, now=now)
return batches
def deallocate(self, batch):
"""Deallocate the record batch."""
self._incomplete.remove(batch)
self._free.deallocate(batch.buffer())
def _flush_in_progress(self):
def flush_in_progress(self):
"""Are there any threads currently waiting on a flush?"""
return self._flushes_in_progress.get() > 0
@@ -535,6 +592,10 @@ class RecordAccumulator(object):
finally:
self._flushes_in_progress.decrement()
@property
def has_incomplete(self):
return bool(self._incomplete)
def abort_incomplete_batches(self):
"""
This function is only called when sender is closed forcefully. It will fail all the
@@ -544,27 +605,41 @@ class RecordAccumulator(object):
# 1. Avoid losing batches.
# 2. Free up memory in case appending threads are blocked on buffer full.
# This is a tight loop but should be able to get through very quickly.
error = Errors.IllegalStateError("Producer is closed forcefully.")
while True:
self._abort_batches()
self._abort_batches(error)
if not self._appends_in_progress.get():
break
# After this point, no thread will append any messages because they will see the close
# flag set. We need to do one last abort after no thread is appending, in case there was a new
# batch appended by the last appending thread.
self._abort_batches()
self._abort_batches(error)
self._batches.clear()
def _abort_batches(self):
def _abort_batches(self, error):
"""Go through incomplete batches and abort them."""
error = Errors.IllegalStateError("Producer is closed forcefully.")
for batch in self._incomplete.all():
tp = batch.topic_partition
# Close the batch before aborting
with self._tp_locks[tp]:
with self._tp_lock(tp):
batch.records.close()
batch.done(exception=error)
self._batches[tp].remove(batch)
batch.abort(error)
self.deallocate(batch)
def abort_undrained_batches(self, error):
for batch in self._incomplete.all():
tp = batch.topic_partition
with self._tp_lock(tp):
aborted = False
if not batch.is_done:
aborted = True
batch.records.close()
self._batches[tp].remove(batch)
if aborted:
batch.abort(error)
self.deallocate(batch)
def close(self):
"""Close this accumulator and force all the record buffers to be drained."""
self._closed = True
@@ -579,12 +654,21 @@ class IncompleteProducerBatches(object):
def add(self, batch):
with self._lock:
return self._incomplete.add(batch)
self._incomplete.add(batch)
def remove(self, batch):
with self._lock:
return self._incomplete.remove(batch)
try:
self._incomplete.remove(batch)
except KeyError:
pass
def all(self):
with self._lock:
return list(self._incomplete)
def __bool__(self):
return bool(self._incomplete)
__nonzero__ = __bool__
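# --- Editor's sketch (illustrative, not part of this commit) ---
# Aliasing __nonzero__ to __bool__ is the usual way to make truthiness
# work on both Python 2 (which calls __nonzero__) and Python 3 (__bool__):
class Box(object):
    def __init__(self, items):
        self._items = items
    def __bool__(self):
        return bool(self._items)
    __nonzero__ = __bool__  # Python 2 compatibility

assert Box([1]) and not Box([])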
@@ -2,6 +2,7 @@ from __future__ import absolute_import, division
import collections
import copy
import heapq
import logging
import threading
import time
@@ -11,6 +12,8 @@ from kafka.vendor import six
from kafka import errors as Errors
from kafka.metrics.measurable import AnonMeasurable
from kafka.metrics.stats import Avg, Max, Rate
from kafka.producer.transaction_manager import ProducerIdAndEpoch
from kafka.protocol.init_producer_id import InitProducerIdRequest
from kafka.protocol.produce import ProduceRequest
from kafka.structs import TopicPartition
from kafka.version import __version__
@@ -27,14 +30,18 @@ class Sender(threading.Thread):
DEFAULT_CONFIG = {
'max_request_size': 1048576,
'acks': 1,
'retries': 0,
'retries': float('inf'),
'request_timeout_ms': 30000,
'retry_backoff_ms': 100,
'metrics': None,
'guarantee_message_order': False,
'transaction_manager': None,
'transactional_id': None,
'transaction_timeout_ms': 60000,
'client_id': 'kafka-python-' + __version__,
'api_version': (0, 8, 0),
}
def __init__(self, client, metadata, accumulator, metrics, **configs):
def __init__(self, client, metadata, accumulator, **configs):
super(Sender, self).__init__()
self.config = copy.copy(self.DEFAULT_CONFIG)
for key in self.config:
@@ -48,32 +55,75 @@ class Sender(threading.Thread):
self._running = True
self._force_close = False
self._topics_to_add = set()
self._sensors = SenderMetrics(metrics, self._client, self._metadata)
if self.config['metrics']:
self._sensors = SenderMetrics(self.config['metrics'], self._client, self._metadata)
else:
self._sensors = None
self._transaction_manager = self.config['transaction_manager']
# A per-partition queue of batches ordered by creation time for tracking the in-flight batches
self._in_flight_batches = collections.defaultdict(list)
def _maybe_remove_from_inflight_batches(self, batch):
try:
queue = self._in_flight_batches[batch.topic_partition]
except KeyError:
return
try:
idx = queue.index((batch.created, batch))
except ValueError:
return
# https://stackoverflow.com/questions/10162679/python-delete-element-from-heap
queue[idx] = queue[-1]
queue.pop()
heapq.heapify(queue)
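# --- Editor's sketch (illustrative, not part of this commit) ---
# The swap-with-last trick above deletes an arbitrary entry from a
# heapq-backed list without popping everything off: overwrite the victim
# with the tail element, shrink the list, then heapify to restore the
# invariant. A minimal standalone version:
import heapq

def heap_delete(heap, item):
    idx = heap.index(item)  # O(n) scan for the entry
    heap[idx] = heap[-1]    # overwrite it with the last element
    heap.pop()              # shrink by one
    heapq.heapify(heap)     # O(n) repair of the heap invariant

h = [3, 1, 4, 1, 5]
heapq.heapify(h)
heap_delete(h, 4)
assert h[0] == 1 and 4 not in h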
def _get_expired_inflight_batches(self, now=None):
"""Get the in-flight batches that has reached delivery timeout."""
expired_batches = []
to_remove = []
for tp, queue in six.iteritems(self._in_flight_batches):
while queue:
_created_at, batch = queue[0]
if batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms):
heapq.heappop(queue)
if batch.final_state is None:
expired_batches.append(batch)
else:
raise Errors.IllegalStateError("%s batch created at %s gets unexpected final state %s" % (batch.topic_partition, batch.created, batch.final_state))
else:
self._accumulator.maybe_update_next_batch_expiry_time(batch)
break
else:
# Avoid mutating in_flight_batches during iteration
to_remove.append(tp)
for tp in to_remove:
del self._in_flight_batches[tp]
return expired_batches
def run(self):
"""The main run loop for the sender thread."""
log.debug("Starting Kafka producer I/O thread.")
log.debug("%s: Starting Kafka producer I/O thread.", str(self))
# main loop, runs until close is called
while self._running:
try:
self.run_once()
except Exception:
log.exception("Uncaught error in kafka producer I/O thread")
log.exception("%s: Uncaught error in kafka producer I/O thread", str(self))
log.debug("Beginning shutdown of Kafka producer I/O thread, sending"
" remaining records.")
log.debug("%s: Beginning shutdown of Kafka producer I/O thread, sending"
" remaining records.", str(self))
# okay we stopped accepting requests but there may still be
# requests in the accumulator or waiting for acknowledgment,
# wait until these are completed.
while (not self._force_close
and (self._accumulator.has_unsent()
and (self._accumulator.has_undrained()
or self._client.in_flight_request_count() > 0)):
try:
self.run_once()
except Exception:
log.exception("Uncaught error in kafka producer I/O thread")
log.exception("%s: Uncaught error in kafka producer I/O thread", str(self))
if self._force_close:
# We need to fail all the incomplete batches and wake up the
@@ -83,38 +133,75 @@ class Sender(threading.Thread):
try:
self._client.close()
except Exception:
log.exception("Failed to close network client")
log.exception("%s: Failed to close network client", str(self))
log.debug("Shutdown of Kafka producer I/O thread has completed.")
log.debug("%s: Shutdown of Kafka producer I/O thread has completed.", str(self))
def run_once(self):
"""Run a single iteration of sending."""
while self._topics_to_add:
self._client.add_topic(self._topics_to_add.pop())
if self._transaction_manager:
try:
if not self._transaction_manager.is_transactional():
# this is an idempotent producer, so make sure we have a producer id
self._maybe_wait_for_producer_id()
elif self._transaction_manager.has_in_flight_transactional_request() or self._maybe_send_transactional_request():
# as long as there are outstanding transactional requests, we simply wait for them to return
self._client.poll(timeout_ms=self.config['retry_backoff_ms'])
return
# do not continue sending if the transaction manager is in a failed state or if there
# is no producer id (for the idempotent case).
if self._transaction_manager.has_fatal_error() or not self._transaction_manager.has_producer_id():
last_error = self._transaction_manager.last_error
if last_error is not None:
self._maybe_abort_batches(last_error)
self._client.poll(timeout_ms=self.config['retry_backoff_ms'])
return
elif self._transaction_manager.has_abortable_error():
self._accumulator.abort_undrained_batches(self._transaction_manager.last_error)
except Errors.SaslAuthenticationFailedError as e:
# This is already logged as error, but propagated here to perform any clean ups.
log.debug("%s: Authentication exception while processing transactional request: %s", str(self), e)
self._transaction_manager.authentication_failed(e)
poll_timeout_ms = self._send_producer_data()
self._client.poll(timeout_ms=poll_timeout_ms)
def _send_producer_data(self, now=None):
now = time.time() if now is None else now
# get the list of partitions with data ready to send
result = self._accumulator.ready(self._metadata)
result = self._accumulator.ready(self._metadata, now=now)
ready_nodes, next_ready_check_delay, unknown_leaders_exist = result
# if there are any partitions whose leaders are not known yet, force
# metadata update
if unknown_leaders_exist:
log.debug('Unknown leaders exist, requesting metadata update')
log.debug('%s: Unknown leaders exist, requesting metadata update', str(self))
self._metadata.request_update()
# remove any nodes we aren't ready to send to
not_ready_timeout = float('inf')
not_ready_timeout_ms = float('inf')
for node in list(ready_nodes):
if not self._client.is_ready(node):
log.debug('Node %s not ready; delaying produce of accumulated batch', node)
node_delay_ms = self._client.connection_delay(node)
log.debug('%s: Node %s not ready; delaying produce of accumulated batch (%f ms)', str(self), node, node_delay_ms)
self._client.maybe_connect(node, wakeup=False)
ready_nodes.remove(node)
not_ready_timeout = min(not_ready_timeout,
self._client.connection_delay(node))
not_ready_timeout_ms = min(not_ready_timeout_ms, node_delay_ms)
# create produce requests
batches_by_node = self._accumulator.drain(
self._metadata, ready_nodes, self.config['max_request_size'])
self._metadata, ready_nodes, self.config['max_request_size'], now=now)
for batch_list in six.itervalues(batches_by_node):
for batch in batch_list:
item = (batch.created, batch)
queue = self._in_flight_batches[batch.topic_partition]
heapq.heappush(queue, item)
if self.config['guarantee_message_order']:
# Mute all the partitions drained
@@ -122,42 +209,130 @@ class Sender(threading.Thread):
for batch in batch_list:
self._accumulator.muted.add(batch.topic_partition)
expired_batches = self._accumulator.abort_expired_batches(
self.config['request_timeout_ms'], self._metadata)
for expired_batch in expired_batches:
self._sensors.record_errors(expired_batch.topic_partition.topic, expired_batch.record_count)
self._accumulator.reset_next_batch_expiry_time()
expired_batches = self._accumulator.expired_batches(now=now)
expired_batches.extend(self._get_expired_inflight_batches(now=now))
if expired_batches:
log.debug("%s: Expired %s batches in accumulator", str(self), len(expired_batches))
# Reset the producer_id if an expired batch has previously been sent to the broker.
# See the documentation of `TransactionState.reset_producer_id` to understand why
# we need to reset the producer id here.
if self._transaction_manager and any([batch.in_retry() for batch in expired_batches]):
needs_transaction_state_reset = True
else:
needs_transaction_state_reset = False
for expired_batch in expired_batches:
error = Errors.KafkaTimeoutError(
"Expiring %d record(s) for %s: %s ms has passed since batch creation" % (
expired_batch.record_count, expired_batch.topic_partition,
int((time.time() - expired_batch.created) * 1000)))
self._fail_batch(expired_batch, error, base_offset=-1)
if self._sensors:
self._sensors.update_produce_request_metrics(batches_by_node)
if needs_transaction_state_reset:
self._transaction_manager.reset_producer_id()
return 0
self._sensors.update_produce_request_metrics(batches_by_node)
requests = self._create_produce_requests(batches_by_node)
# If we have any nodes that are ready to send + have sendable data,
# poll with 0 timeout so this can immediately loop and try sending more
# data. Otherwise, the timeout is determined by nodes that have
# partitions with data that isn't yet sendable (e.g. lingering, backing
# off). Note that this specifically does not include nodes with
# data. Otherwise, the timeout will be the smaller value between next
# batch expiry time, and the delay time for checking data availability.
# Note that the nodes may have data that isn't yet sendable due to
# lingering, backing off, etc. This specifically does not include nodes with
# sendable data that aren't ready to send since they would cause busy
# looping.
poll_timeout_ms = min(next_ready_check_delay * 1000, not_ready_timeout)
poll_timeout_ms = min(next_ready_check_delay * 1000,
not_ready_timeout_ms,
self._accumulator.next_expiry_time_ms - now * 1000)
if poll_timeout_ms < 0:
poll_timeout_ms = 0
if ready_nodes:
log.debug("Nodes with data ready to send: %s", ready_nodes) # trace
log.debug("Created %d produce requests: %s", len(requests), requests) # trace
log.debug("%s: Nodes with data ready to send: %s", str(self), ready_nodes) # trace
log.debug("%s: Created %d produce requests: %s", str(self), len(requests), requests) # trace
# if some partitions are already ready to be sent, the select time
# would be 0; otherwise if some partition already has some data
# accumulated but not ready yet, the select time will be the time
# difference between now and its linger expiry time; otherwise the
# select time will be the time difference between now and the
# metadata expiry time
poll_timeout_ms = 0
for node_id, request in six.iteritems(requests):
batches = batches_by_node[node_id]
log.debug('Sending Produce Request: %r', request)
log.debug('%s: Sending Produce Request: %r', str(self), request)
(self._client.send(node_id, request, wakeup=False)
.add_callback(
self._handle_produce_response, node_id, time.time(), batches)
.add_errback(
self._failed_produce, batches, node_id))
return poll_timeout_ms
# if some partitions are already ready to be sent, the select time
# would be 0; otherwise if some partition already has some data
# accumulated but not ready yet, the select time will be the time
# difference between now and its linger expiry time; otherwise the
# select time will be the time difference between now and the
# metadata expiry time
self._client.poll(timeout_ms=poll_timeout_ms)
def _maybe_send_transactional_request(self):
if self._transaction_manager.is_completing() and self._accumulator.has_incomplete:
if self._transaction_manager.is_aborting():
self._accumulator.abort_undrained_batches(Errors.KafkaError("Failing batch since transaction was aborted"))
# There may still be requests left which are being retried. Since we do not know whether they had
# been successfully appended to the broker log, we must resend them until their final status is clear.
# If they had been appended and we did not receive the response, then our sequence number would no longer
# be correct, which would lead to an OutOfOrderSequenceNumberError.
if not self._accumulator.flush_in_progress():
self._accumulator.begin_flush()
next_request_handler = self._transaction_manager.next_request_handler(self._accumulator.has_incomplete)
if next_request_handler is None:
return False
log.debug("%s: Sending transactional request %s", str(self), next_request_handler.request)
while not self._force_close:
target_node = None
try:
if next_request_handler.needs_coordinator():
target_node = self._transaction_manager.coordinator(next_request_handler.coordinator_type)
if target_node is None:
self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
break
elif not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']):
self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
target_node = None
break
else:
target_node = self._client.least_loaded_node()
if target_node is not None and not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']):
target_node = None
if target_node is not None:
if next_request_handler.is_retry:
time.sleep(self.config['retry_backoff_ms'] / 1000)
txn_correlation_id = self._transaction_manager.next_in_flight_request_correlation_id()
future = self._client.send(target_node, next_request_handler.request)
future.add_both(next_request_handler.on_complete, txn_correlation_id)
return True
except Exception as e:
log.warn("%s: Got an exception when trying to find a node to send a transactional request to. Going to back off and retry: %s", str(self), e)
if next_request_handler.needs_coordinator():
self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
break
time.sleep(self.config['retry_backoff_ms'] / 1000)
self._metadata.request_update()
if target_node is None:
self._transaction_manager.retry(next_request_handler)
return True
def _maybe_abort_batches(self, exc):
if self._accumulator.has_incomplete:
log.error("%s: Aborting producer batches due to fatal error: %s", str(self), exc)
self._accumulator.abort_batches(exc)
def initiate_close(self):
"""Start closing the sender (won't complete until all data is sent)."""
@@ -180,82 +355,164 @@ class Sender(threading.Thread):
self._topics_to_add.add(topic)
self.wakeup()
def _maybe_wait_for_producer_id(self):
while not self._transaction_manager.has_producer_id():
try:
node_id = self._client.least_loaded_node()
if node_id is None or not self._client.await_ready(node_id):
log.debug("%s, Could not find an available broker to send InitProducerIdRequest to." +
" Will back off and try again.", str(self))
time.sleep(self._client.least_loaded_node_refresh_ms() / 1000)
continue
version = self._client.api_version(InitProducerIdRequest, max_version=1)
request = InitProducerIdRequest[version](
transactional_id=self.config['transactional_id'],
transaction_timeout_ms=self.config['transaction_timeout_ms'],
)
response = self._client.send_and_receive(node_id, request)
error_type = Errors.for_code(response.error_code)
if error_type is Errors.NoError:
self._transaction_manager.set_producer_id_and_epoch(ProducerIdAndEpoch(response.producer_id, response.producer_epoch))
break
elif getattr(error_type, 'retriable', False):
log.debug("%s: Retriable error from InitProducerId response: %s", str(self), error_type.__name__)
if getattr(error_type, 'invalid_metadata', False):
self._metadata.request_update()
else:
self._transaction_manager.transition_to_fatal_error(error_type())
break
except Errors.KafkaConnectionError:
log.debug("%s: Broker %s disconnected while awaiting InitProducerId response", str(self), node_id)
except Errors.RequestTimedOutError:
log.debug("%s: InitProducerId request to node %s timed out", str(self), node_id)
log.debug("%s: Retry InitProducerIdRequest in %sms.", str(self), self.config['retry_backoff_ms'])
time.sleep(self.config['retry_backoff_ms'] / 1000)
def _failed_produce(self, batches, node_id, error):
log.debug("Error sending produce request to node %d: %s", node_id, error) # trace
log.error("%s: Error sending produce request to node %d: %s", str(self), node_id, error) # trace
for batch in batches:
self._complete_batch(batch, error, -1, None)
self._complete_batch(batch, error, -1)
def _handle_produce_response(self, node_id, send_time, batches, response):
"""Handle a produce response."""
# if we have a response, parse it
log.debug('Parsing produce response: %r', response)
log.debug('%s: Parsing produce response: %r', str(self), response)
if response:
batches_by_partition = dict([(batch.topic_partition, batch)
for batch in batches])
for topic, partitions in response.topics:
for partition_info in partitions:
global_error = None
log_start_offset = None
if response.API_VERSION < 2:
partition, error_code, offset = partition_info
ts = None
elif 2 <= response.API_VERSION <= 4:
partition, error_code, offset, ts = partition_info
elif 5 <= response.API_VERSION <= 7:
partition, error_code, offset, ts, log_start_offset = partition_info
partition, error_code, offset, ts, _log_start_offset = partition_info
else:
# the ignored parameter is record_error of type list[(batch_index: int, error_message: str)]
partition, error_code, offset, ts, log_start_offset, _, global_error = partition_info
# Currently unused / TODO: KIP-467
partition, error_code, offset, ts, _log_start_offset, _record_errors, _global_error = partition_info
tp = TopicPartition(topic, partition)
error = Errors.for_code(error_code)
batch = batches_by_partition[tp]
self._complete_batch(batch, error, offset, ts, log_start_offset, global_error)
if response.API_VERSION > 0:
self._sensors.record_throttle_time(response.throttle_time_ms, node=node_id)
self._complete_batch(batch, error, offset, timestamp_ms=ts)
else:
# this is the acks = 0 case, just complete all requests
for batch in batches:
self._complete_batch(batch, None, -1, None)
self._complete_batch(batch, None, -1)
def _complete_batch(self, batch, error, base_offset, timestamp_ms=None, log_start_offset=None, global_error=None):
def _fail_batch(self, batch, exception, base_offset=None, timestamp_ms=None):
exception = exception if type(exception) is not type else exception()
if self._transaction_manager:
if isinstance(exception, Errors.OutOfOrderSequenceNumberError) and \
not self._transaction_manager.is_transactional() and \
self._transaction_manager.has_producer_id(batch.producer_id):
log.error("%s: The broker received an out of order sequence number for topic-partition %s"
" at offset %s. This indicates data loss on the broker, and should be investigated.",
str(self), batch.topic_partition, base_offset)
# Reset the transaction state since we have hit an irrecoverable exception and cannot make any guarantees
# about the previously committed message. Note that this will discard the producer id and sequence
# numbers for all existing partitions.
self._transaction_manager.reset_producer_id()
elif isinstance(exception, (Errors.ClusterAuthorizationFailedError,
Errors.TransactionalIdAuthorizationFailedError,
Errors.ProducerFencedError,
Errors.InvalidTxnStateError)):
self._transaction_manager.transition_to_fatal_error(exception)
elif self._transaction_manager.is_transactional():
self._transaction_manager.transition_to_abortable_error(exception)
if self._sensors:
self._sensors.record_errors(batch.topic_partition.topic, batch.record_count)
if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms, exception=exception):
self._maybe_remove_from_inflight_batches(batch)
self._accumulator.deallocate(batch)
def _complete_batch(self, batch, error, base_offset, timestamp_ms=None):
"""Complete or retry the given batch of records.
Arguments:
batch (RecordBatch): The record batch
batch (ProducerBatch): The record batch
error (Exception): The error (or None if none)
base_offset (int): The base offset assigned to the records if successful
timestamp_ms (int, optional): The timestamp returned by the broker for this batch
log_start_offset (int): The start offset of the log at the time this produce response was created
global_error (str): The summarising error message
"""
# Standardize no-error to None
if error is Errors.NoError:
error = None
if error is not None and self._can_retry(batch, error):
# retry
log.warning("Got error produce response on topic-partition %s,"
" retrying (%d attempts left). Error: %s",
batch.topic_partition,
self.config['retries'] - batch.attempts - 1,
global_error or error)
self._accumulator.reenqueue(batch)
self._sensors.record_retries(batch.topic_partition.topic, batch.record_count)
if error is not None:
if self._can_retry(batch, error):
# retry
log.warning("%s: Got error produce response on topic-partition %s,"
" retrying (%s attempts left). Error: %s",
str(self), batch.topic_partition,
self.config['retries'] - batch.attempts - 1,
error)
# If idempotence is enabled only retry the request if the batch matches our current producer id and epoch
if not self._transaction_manager or self._transaction_manager.producer_id_and_epoch.match(batch):
log.debug("%s: Retrying batch to topic-partition %s. Sequence number: %s",
str(self), batch.topic_partition,
self._transaction_manager.sequence_number(batch.topic_partition) if self._transaction_manager else None)
self._accumulator.reenqueue(batch)
self._maybe_remove_from_inflight_batches(batch)
if self._sensors:
self._sensors.record_retries(batch.topic_partition.topic, batch.record_count)
else:
log.warning("%s: Attempted to retry sending a batch but the producer id/epoch changed from %s/%s to %s/%s. This batch will be dropped",
str(self), batch.producer_id, batch.producer_epoch,
self._transaction_manager.producer_id_and_epoch.producer_id,
self._transaction_manager.producer_id_and_epoch.epoch)
self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms)
else:
if error is Errors.TopicAuthorizationFailedError:
error = error(batch.topic_partition.topic)
# tell the user the result of their request
self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms)
if error is Errors.UnknownTopicOrPartitionError:
log.warning("%s: Received unknown topic or partition error in produce request on partition %s."
" The topic/partition may not exist or the user may not have Describe access to it",
str(self), batch.topic_partition)
if getattr(error, 'invalid_metadata', False):
self._metadata.request_update()
else:
if error is Errors.TopicAuthorizationFailedError:
error = error(batch.topic_partition.topic)
if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms):
self._maybe_remove_from_inflight_batches(batch)
self._accumulator.deallocate(batch)
# tell the user the result of their request
batch.done(base_offset, timestamp_ms, error, log_start_offset, global_error)
self._accumulator.deallocate(batch)
if error is not None:
self._sensors.record_errors(batch.topic_partition.topic, batch.record_count)
if getattr(error, 'invalid_metadata', False):
self._metadata.request_update()
if self._transaction_manager and self._transaction_manager.producer_id_and_epoch.match(batch):
self._transaction_manager.increment_sequence_number(batch.topic_partition, batch.record_count)
log.debug("%s: Incremented sequence number for topic-partition %s to %s", str(self), batch.topic_partition,
self._transaction_manager.sequence_number(batch.topic_partition))
# Unmute the completed partition.
if self.config['guarantee_message_order']:
@@ -266,8 +523,10 @@ class Sender(threading.Thread):
We can retry a send if the error is transient and the number of
attempts taken is fewer than the maximum allowed
"""
return (batch.attempts < self.config['retries']
and getattr(error, 'retriable', False))
return (not batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms) and
batch.attempts < self.config['retries'] and
batch.final_state is None and
getattr(error, 'retriable', False))
def _create_produce_requests(self, collated):
"""
@@ -275,23 +534,24 @@ class Sender(threading.Thread):
per-node basis.
Arguments:
collated: {node_id: [RecordBatch]}
collated: {node_id: [ProducerBatch]}
Returns:
dict: {node_id: ProduceRequest} (version depends on api_version)
dict: {node_id: ProduceRequest} (version depends on client api_versions)
"""
requests = {}
for node_id, batches in six.iteritems(collated):
requests[node_id] = self._produce_request(
node_id, self.config['acks'],
self.config['request_timeout_ms'], batches)
if batches:
requests[node_id] = self._produce_request(
node_id, self.config['acks'],
self.config['request_timeout_ms'], batches)
return requests
def _produce_request(self, node_id, acks, timeout, batches):
"""Create a produce request from the given record batches.
Returns:
ProduceRequest (version depends on api_version)
ProduceRequest (version depends on client api_versions)
"""
produce_records_by_partition = collections.defaultdict(dict)
for batch in batches:
@@ -301,32 +561,26 @@ class Sender(threading.Thread):
buf = batch.records.buffer()
produce_records_by_partition[topic][partition] = buf
kwargs = {}
if self.config['api_version'] >= (2, 1):
version = 7
elif self.config['api_version'] >= (2, 0):
version = 6
elif self.config['api_version'] >= (1, 1):
version = 5
elif self.config['api_version'] >= (1, 0):
version = 4
elif self.config['api_version'] >= (0, 11):
version = 3
kwargs = dict(transactional_id=None)
elif self.config['api_version'] >= (0, 10):
version = 2
elif self.config['api_version'] == (0, 9):
version = 1
version = self._client.api_version(ProduceRequest, max_version=7)
topic_partition_data = [
(topic, list(partition_info.items()))
for topic, partition_info in six.iteritems(produce_records_by_partition)]
transactional_id = self._transaction_manager.transactional_id if self._transaction_manager else None
if version >= 3:
return ProduceRequest[version](
transactional_id=transactional_id,
required_acks=acks,
timeout=timeout,
topics=topic_partition_data,
)
else:
version = 0
return ProduceRequest[version](
required_acks=acks,
timeout=timeout,
topics=[(topic, list(partition_info.items()))
for topic, partition_info
in six.iteritems(produce_records_by_partition)],
**kwargs
)
if transactional_id is not None:
log.warning('%s: Broker does not support ProduceRequest v3+, required for transactional_id', str(self))
return ProduceRequest[version](
required_acks=acks,
timeout=timeout,
topics=topic_partition_data,
)
def wakeup(self):
"""Wake up the selector associated with this send thread."""
@@ -335,6 +589,9 @@ class Sender(threading.Thread):
def bootstrap_connected(self):
return self._client.bootstrap_connected()
def __str__(self):
return "<Sender client_id=%s transactional_id=%s>" % (self.config['client_id'], self.config['transactional_id'])
class SenderMetrics(object):
@@ -367,15 +624,6 @@ class SenderMetrics(object):
sensor_name=sensor_name,
description='The maximum time in ms record batches spent in the record accumulator.')
sensor_name = 'produce-throttle-time'
self.produce_throttle_time_sensor = self.metrics.sensor(sensor_name)
self.add_metric('produce-throttle-time-avg', Avg(),
sensor_name=sensor_name,
description='The average throttle time in ms')
self.add_metric('produce-throttle-time-max', Max(),
sensor_name=sensor_name,
description='The maximum throttle time in ms')
sensor_name = 'records-per-request'
self.records_per_request_sensor = self.metrics.sensor(sensor_name)
self.add_metric('record-send-rate', Rate(),
@@ -498,8 +746,9 @@ class SenderMetrics(object):
records += batch.record_count
total_bytes += batch.records.size_in_bytes()
self.records_per_request_sensor.record(records)
self.byte_rate_sensor.record(total_bytes)
if node_batch:
self.records_per_request_sensor.record(records)
self.byte_rate_sensor.record(total_bytes)
def record_retries(self, topic, count):
self.retry_sensor.record(count)
@@ -512,6 +761,3 @@ class SenderMetrics(object):
sensor = self.metrics.get_sensor('topic.' + topic + '.record-errors')
if sensor:
sensor.record(count)
def record_throttle_time(self, throttle_time_ms, node=None):
self.produce_throttle_time_sensor.record(throttle_time_ms)
@@ -0,0 +1,981 @@
from __future__ import absolute_import, division
import abc
import collections
import heapq
import logging
import threading
from kafka.vendor import six
try:
# enum in stdlib as of py3.4
from enum import IntEnum # pylint: disable=import-error
except ImportError:
# vendored backport module
from kafka.vendor.enum34 import IntEnum
import kafka.errors as Errors
from kafka.protocol.add_offsets_to_txn import AddOffsetsToTxnRequest
from kafka.protocol.add_partitions_to_txn import AddPartitionsToTxnRequest
from kafka.protocol.end_txn import EndTxnRequest
from kafka.protocol.find_coordinator import FindCoordinatorRequest
from kafka.protocol.init_producer_id import InitProducerIdRequest
from kafka.protocol.txn_offset_commit import TxnOffsetCommitRequest
from kafka.structs import TopicPartition
log = logging.getLogger(__name__)
NO_PRODUCER_ID = -1
NO_PRODUCER_EPOCH = -1
NO_SEQUENCE = -1
class ProducerIdAndEpoch(object):
__slots__ = ('producer_id', 'epoch')
def __init__(self, producer_id, epoch):
self.producer_id = producer_id
self.epoch = epoch
@property
def is_valid(self):
return NO_PRODUCER_ID < self.producer_id
def match(self, batch):
return self.producer_id == batch.producer_id and self.epoch == batch.producer_epoch
def __eq__(self, other):
return isinstance(other, ProducerIdAndEpoch) and self.producer_id == other.producer_id and self.epoch == other.epoch
def __str__(self):
return "ProducerIdAndEpoch(producer_id={}, epoch={})".format(self.producer_id, self.epoch)
class TransactionState(IntEnum):
UNINITIALIZED = 0
INITIALIZING = 1
READY = 2
IN_TRANSACTION = 3
COMMITTING_TRANSACTION = 4
ABORTING_TRANSACTION = 5
ABORTABLE_ERROR = 6
FATAL_ERROR = 7
@classmethod
def is_transition_valid(cls, source, target):
if target == cls.INITIALIZING:
return source == cls.UNINITIALIZED
elif target == cls.READY:
return source in (cls.INITIALIZING, cls.COMMITTING_TRANSACTION, cls.ABORTING_TRANSACTION)
elif target == cls.IN_TRANSACTION:
return source == cls.READY
elif target == cls.COMMITTING_TRANSACTION:
return source == cls.IN_TRANSACTION
elif target == cls.ABORTING_TRANSACTION:
return source in (cls.IN_TRANSACTION, cls.ABORTABLE_ERROR)
elif target == cls.ABORTABLE_ERROR:
return source in (cls.IN_TRANSACTION, cls.COMMITTING_TRANSACTION, cls.ABORTABLE_ERROR)
elif target == cls.UNINITIALIZED:
# Disallow transitions to UNINITIALIZED
return False
elif target == cls.FATAL_ERROR:
# We can transition to FATAL_ERROR unconditionally.
# FATAL_ERROR is never a valid starting state for any transition. So the only option is to close the
# producer or do purely non transactional requests.
return True
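# --- Editor's sketch (illustrative, not part of this commit) ---
# The table above gates every state change; _transition_to (below) raises
# unless (source, target) is allowed. Committing, for example, is only
# legal from IN_TRANSACTION:
assert TransactionState.is_transition_valid(
    TransactionState.IN_TRANSACTION, TransactionState.COMMITTING_TRANSACTION)
assert not TransactionState.is_transition_valid(
    TransactionState.READY, TransactionState.COMMITTING_TRANSACTION)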
class Priority(IntEnum):
# We use the priority to determine the order in which requests need to be sent out. For instance, if we have
# a pending FindCoordinator request, that must always go first. Next, if we need a producer id, that must go second.
# The endTxn request must always go last.
FIND_COORDINATOR = 0
INIT_PRODUCER_ID = 1
ADD_PARTITIONS_OR_OFFSETS = 2
END_TXN = 3
class TransactionManager(object):
"""
A class which maintains state for transactions. Also keeps the state necessary to ensure idempotent production.
"""
NO_INFLIGHT_REQUEST_CORRELATION_ID = -1
# The retry_backoff_ms is overridden to the following value if the first AddPartitions receives a
# CONCURRENT_TRANSACTIONS error.
ADD_PARTITIONS_RETRY_BACKOFF_MS = 20
def __init__(self, transactional_id=None, transaction_timeout_ms=0, retry_backoff_ms=100, api_version=(0, 11), metadata=None):
self._api_version = api_version
self._metadata = metadata
self._sequence_numbers = collections.defaultdict(lambda: 0)
self.transactional_id = transactional_id
self.transaction_timeout_ms = transaction_timeout_ms
self._transaction_coordinator = None
self._consumer_group_coordinator = None
self._new_partitions_in_transaction = set()
self._pending_partitions_in_transaction = set()
self._partitions_in_transaction = set()
self._pending_txn_offset_commits = dict()
self._current_state = TransactionState.UNINITIALIZED
self._last_error = None
self.producer_id_and_epoch = ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH)
self._transaction_started = False
self._pending_requests = [] # priority queue via heapq
self._pending_requests_sort_id = 0
self._in_flight_request_correlation_id = self.NO_INFLIGHT_REQUEST_CORRELATION_ID
# This is used by the TxnRequestHandlers to control how long to back off before a given request is retried.
# For instance, this value is lowered by the AddPartitionsToTxnHandler when it receives a CONCURRENT_TRANSACTIONS
# error for the first AddPartitionsRequest in a transaction.
self.retry_backoff_ms = retry_backoff_ms
self._lock = threading.Condition()
def initialize_transactions(self):
with self._lock:
self._ensure_transactional()
self._transition_to(TransactionState.INITIALIZING)
self.set_producer_id_and_epoch(ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH))
self._sequence_numbers.clear()
handler = InitProducerIdHandler(self, self.transaction_timeout_ms)
self._enqueue_request(handler)
return handler.result
def begin_transaction(self):
with self._lock:
self._ensure_transactional()
self._maybe_fail_with_error()
self._transition_to(TransactionState.IN_TRANSACTION)
def begin_commit(self):
with self._lock:
self._ensure_transactional()
self._maybe_fail_with_error()
self._transition_to(TransactionState.COMMITTING_TRANSACTION)
return self._begin_completing_transaction(True)
def begin_abort(self):
with self._lock:
self._ensure_transactional()
if self._current_state != TransactionState.ABORTABLE_ERROR:
self._maybe_fail_with_error()
self._transition_to(TransactionState.ABORTING_TRANSACTION)
# We're aborting the transaction, so there should be no need to add new partitions
self._new_partitions_in_transaction.clear()
return self._begin_completing_transaction(False)
def _begin_completing_transaction(self, committed):
if self._new_partitions_in_transaction:
self._enqueue_request(self._add_partitions_to_transaction_handler())
handler = EndTxnHandler(self, committed)
self._enqueue_request(handler)
return handler.result
def send_offsets_to_transaction(self, offsets, consumer_group_id):
with self._lock:
self._ensure_transactional()
self._maybe_fail_with_error()
if self._current_state != TransactionState.IN_TRANSACTION:
raise Errors.KafkaError("Cannot send offsets to transaction because the producer is not in an active transaction")
log.debug("Begin adding offsets %s for consumer group %s to transaction", offsets, consumer_group_id)
handler = AddOffsetsToTxnHandler(self, consumer_group_id, offsets)
self._enqueue_request(handler)
return handler.result
def maybe_add_partition_to_transaction(self, topic_partition):
with self._lock:
self._fail_if_not_ready_for_send()
if self.is_partition_added(topic_partition) or self.is_partition_pending_add(topic_partition):
return
log.debug("Begin adding new partition %s to transaction", topic_partition)
self._new_partitions_in_transaction.add(topic_partition)
def _fail_if_not_ready_for_send(self):
with self._lock:
if self.has_error():
raise Errors.KafkaError(
"Cannot perform send because at least one previous transactional or"
" idempotent request has failed with errors.", self._last_error)
if self.is_transactional():
if not self.has_producer_id():
raise Errors.IllegalStateError(
"Cannot perform a 'send' before completing a call to init_transactions"
" when transactions are enabled.")
if self._current_state != TransactionState.IN_TRANSACTION:
raise Errors.IllegalStateError("Cannot call send in state %s" % (self._current_state.name,))
def is_send_to_partition_allowed(self, tp):
with self._lock:
if self.has_fatal_error():
return False
return not self.is_transactional() or tp in self._partitions_in_transaction
def has_producer_id(self, producer_id=None):
if producer_id is None:
return self.producer_id_and_epoch.is_valid
else:
return self.producer_id_and_epoch.producer_id == producer_id
def is_transactional(self):
return self.transactional_id is not None
def has_partitions_to_add(self):
with self._lock:
return bool(self._new_partitions_in_transaction) or bool(self._pending_partitions_in_transaction)
def is_completing(self):
with self._lock:
return self._current_state in (
TransactionState.COMMITTING_TRANSACTION,
TransactionState.ABORTING_TRANSACTION)
@property
def last_error(self):
return self._last_error
def has_error(self):
with self._lock:
return self._current_state in (
TransactionState.ABORTABLE_ERROR,
TransactionState.FATAL_ERROR)
def is_aborting(self):
with self._lock:
return self._current_state == TransactionState.ABORTING_TRANSACTION
def transition_to_abortable_error(self, exc):
with self._lock:
if self._current_state == TransactionState.ABORTING_TRANSACTION:
log.debug("Skipping transition to abortable error state since the transaction is already being "
" aborted. Underlying exception: %s", exc)
return
self._transition_to(TransactionState.ABORTABLE_ERROR, error=exc)
def transition_to_fatal_error(self, exc):
with self._lock:
self._transition_to(TransactionState.FATAL_ERROR, error=exc)
def is_partition_added(self, partition):
with self._lock:
return partition in self._partitions_in_transaction
def is_partition_pending_add(self, partition):
return partition in self._new_partitions_in_transaction or partition in self._pending_partitions_in_transaction
def has_producer_id_and_epoch(self, producer_id, producer_epoch):
return (
self.producer_id_and_epoch.producer_id == producer_id and
self.producer_id_and_epoch.epoch == producer_epoch
)
def set_producer_id_and_epoch(self, producer_id_and_epoch):
if not isinstance(producer_id_and_epoch, ProducerIdAndEpoch):
raise TypeError("ProducerAndIdEpoch type required")
log.info("ProducerId set to %s with epoch %s",
producer_id_and_epoch.producer_id, producer_id_and_epoch.epoch)
self.producer_id_and_epoch = producer_id_and_epoch
def reset_producer_id(self):
"""
This method is used when the producer needs to reset its internal state because of an irrecoverable exception
from the broker.
We need to reset the producer id and associated state when we have sent a batch to the broker, but we either get
a non-retriable exception or we run out of retries, or the batch expired in the producer queue after it was already
sent to the broker.
In all of these cases, we don't know whether the batch was actually committed on the broker, and hence whether the
sequence number was actually updated. If we don't reset the producer state, we risk the chance that all future
messages will return an OutOfOrderSequenceNumberError.
Note that we can't reset the producer state for the transactional producer as this would mean bumping the epoch
for the same producer id. This might involve aborting the ongoing transaction during the initProducerIdRequest,
and the user would not have any way of knowing this happened. So for the transactional producer,
it's best to return the produce error to the user and let them abort the transaction and close the producer explicitly.
"""
with self._lock:
if self.is_transactional():
raise Errors.IllegalStateError(
"Cannot reset producer state for a transactional producer."
" You must either abort the ongoing transaction or"
" reinitialize the transactional producer instead")
self.set_producer_id_and_epoch(ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH))
self._sequence_numbers.clear()
def sequence_number(self, tp):
with self._lock:
return self._sequence_numbers[tp]
def increment_sequence_number(self, tp, increment):
with self._lock:
if tp not in self._sequence_numbers:
raise Errors.IllegalStateError("Attempt to increment sequence number for a partition with no current sequence.")
# Sequence number wraps at java max int
base = self._sequence_numbers[tp]
if base > (2147483647 - increment):
self._sequence_numbers[tp] = increment - (2147483647 - base) - 1
else:
self._sequence_numbers[tp] += increment
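# --- Editor's sketch (illustrative, not part of this commit) ---
# Sequence numbers wrap at the Java int32 maximum (2**31 - 1 = 2147483647),
# matching the broker. The branch above keeps the counter in [0, 2**31 - 1];
# standalone version of the same arithmetic:
MAX_INT32 = 2147483647

def next_sequence(base, increment):
    if base > MAX_INT32 - increment:
        return increment - (MAX_INT32 - base) - 1
    return base + increment

assert next_sequence(0, 5) == 5
assert next_sequence(MAX_INT32, 1) == 0  # one step past the max wraps to 0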
def next_request_handler(self, has_incomplete_batches):
with self._lock:
if self._new_partitions_in_transaction:
self._enqueue_request(self._add_partitions_to_transaction_handler())
if not self._pending_requests:
return None
_, _, next_request_handler = self._pending_requests[0]
# Do not send the EndTxn until all batches have been flushed
if isinstance(next_request_handler, EndTxnHandler) and has_incomplete_batches:
return None
heapq.heappop(self._pending_requests)
if self._maybe_terminate_request_with_error(next_request_handler):
log.debug("Not sending transactional request %s because we are in an error state",
next_request_handler.request)
return None
if isinstance(next_request_handler, EndTxnHandler) and not self._transaction_started:
next_request_handler.result.done()
if self._current_state != TransactionState.FATAL_ERROR:
log.debug("Not sending EndTxn for completed transaction since no partitions"
" or offsets were successfully added")
self._complete_transaction()
try:
_, _, next_request_handler = heapq.heappop(self._pending_requests)
except IndexError:
next_request_handler = None
if next_request_handler:
log.debug("Request %s dequeued for sending", next_request_handler.request)
return next_request_handler
def retry(self, request):
with self._lock:
request.set_retry()
self._enqueue_request(request)
def authentication_failed(self, exc):
with self._lock:
for _, _, request in self._pending_requests:
request.fatal_error(exc)
def coordinator(self, coord_type):
if coord_type == 'group':
return self._consumer_group_coordinator
elif coord_type == 'transaction':
return self._transaction_coordinator
else:
raise Errors.IllegalStateError("Received an invalid coordinator type: %s" % (coord_type,))
def lookup_coordinator_for_request(self, request):
self._lookup_coordinator(request.coordinator_type, request.coordinator_key)
def next_in_flight_request_correlation_id(self):
self._in_flight_request_correlation_id += 1
return self._in_flight_request_correlation_id
def clear_in_flight_transactional_request_correlation_id(self):
self._in_flight_request_correlation_id = self.NO_INFLIGHT_REQUEST_CORRELATION_ID
def has_in_flight_transactional_request(self):
return self._in_flight_request_correlation_id != self.NO_INFLIGHT_REQUEST_CORRELATION_ID
def has_fatal_error(self):
return self._current_state == TransactionState.FATAL_ERROR
def has_abortable_error(self):
return self._current_state == TransactionState.ABORTABLE_ERROR
# visible for testing
def _test_transaction_contains_partition(self, tp):
with self._lock:
return tp in self._partitions_in_transaction
# visible for testing
def _test_has_pending_offset_commits(self):
return bool(self._pending_txn_offset_commits)
# visible for testing
def _test_has_ongoing_transaction(self):
with self._lock:
# transactions are considered ongoing once started until completion or a fatal error
return self._current_state == TransactionState.IN_TRANSACTION or self.is_completing() or self.has_abortable_error()
# visible for testing
def _test_is_ready(self):
with self._lock:
return self.is_transactional() and self._current_state == TransactionState.READY
def _transition_to(self, target, error=None):
with self._lock:
if not self._current_state.is_transition_valid(self._current_state, target):
raise Errors.KafkaError("TransactionalId %s: Invalid transition attempted from state %s to state %s" % (
self.transactional_id, self._current_state.name, target.name))
if target in (TransactionState.FATAL_ERROR, TransactionState.ABORTABLE_ERROR):
if error is None:
raise Errors.IllegalArgumentError("Cannot transition to %s with a None exception" % (target.name,))
self._last_error = error
else:
self._last_error = None
if self._last_error is not None:
log.debug("Transition from state %s to error state %s (%s)", self._current_state.name, target.name, self._last_error)
else:
log.debug("Transition from state %s to %s", self._current_state, target)
self._current_state = target
def _ensure_transactional(self):
if not self.is_transactional():
raise Errors.IllegalStateError("Transactional method invoked on a non-transactional producer.")
def _maybe_fail_with_error(self):
if self.has_error():
raise Errors.KafkaError("Cannot execute transactional method because we are in an error state: %s" % (self._last_error,))
def _maybe_terminate_request_with_error(self, request_handler):
if self.has_error():
if self.has_abortable_error() and isinstance(request_handler, FindCoordinatorHandler):
# No harm letting the FindCoordinator request go through if we're expecting to abort
return False
request_handler.fail(self._last_error)
return True
return False
def _next_pending_requests_sort_id(self):
self._pending_requests_sort_id += 1
return self._pending_requests_sort_id
def _enqueue_request(self, request_handler):
log.debug("Enqueuing transactional request %s", request_handler.request)
heapq.heappush(
self._pending_requests,
(
request_handler.priority, # keep lowest priority at head of queue
self._next_pending_requests_sort_id(), # break ties
request_handler
)
)
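# --- Editor's sketch (illustrative, not part of this commit) ---
# Pushing (priority, monotonic id, handler) tuples keeps the heap ordering
# stable and avoids ever comparing two handler objects (which define no
# ordering) when priorities tie:
import heapq
import itertools

ids = itertools.count()
pq = []
heapq.heappush(pq, (1, next(ids), 'init_pid'))
heapq.heappush(pq, (0, next(ids), 'find_coordinator'))
heapq.heappush(pq, (1, next(ids), 'add_partitions'))
order = [name for _, _, name in (heapq.heappop(pq) for _ in range(3))]
assert order == ['find_coordinator', 'init_pid', 'add_partitions']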
def _lookup_coordinator(self, coord_type, coord_key):
with self._lock:
if coord_type == 'group':
self._consumer_group_coordinator = None
elif coord_type == 'transaction':
self._transaction_coordinator = None
else:
raise Errors.IllegalStateError("Invalid coordinator type: %s" % (coord_type,))
self._enqueue_request(FindCoordinatorHandler(self, coord_type, coord_key))
def _complete_transaction(self):
with self._lock:
self._transition_to(TransactionState.READY)
self._transaction_started = False
self._new_partitions_in_transaction.clear()
self._pending_partitions_in_transaction.clear()
self._partitions_in_transaction.clear()
def _add_partitions_to_transaction_handler(self):
with self._lock:
self._pending_partitions_in_transaction.update(self._new_partitions_in_transaction)
self._new_partitions_in_transaction.clear()
return AddPartitionsToTxnHandler(self, self._pending_partitions_in_transaction)
class TransactionalRequestResult(object):
def __init__(self):
self._latch = threading.Event()
self._error = None
def done(self, error=None):
self._error = error
self._latch.set()
def wait(self, timeout_ms=None):
timeout = timeout_ms / 1000 if timeout_ms is not None else None
success = self._latch.wait(timeout)
if self._error:
raise self._error
return success
@property
def is_done(self):
return self._latch.is_set()
@property
def succeeded(self):
return self._latch.is_set() and self._error is None
@property
def failed(self):
return self._latch.is_set() and self._error is not None
@property
def exception(self):
return self._error
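

# Abstract base for all transactional request handlers. Subclasses supply the
# wire request, a queue priority, and handle_response(); connection errors,
# retries, and coordinator rediscovery are handled generically in on_complete().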
@six.add_metaclass(abc.ABCMeta)
class TxnRequestHandler(object):
def __init__(self, transaction_manager, result=None):
self.transaction_manager = transaction_manager
self.retry_backoff_ms = transaction_manager.retry_backoff_ms
self.request = None
self._result = result or TransactionalRequestResult()
self._is_retry = False
@property
def transactional_id(self):
return self.transaction_manager.transactional_id
@property
def producer_id(self):
return self.transaction_manager.producer_id_and_epoch.producer_id
@property
def producer_epoch(self):
return self.transaction_manager.producer_id_and_epoch.epoch
def fatal_error(self, exc):
self.transaction_manager.transition_to_fatal_error(exc)
self._result.done(error=exc)
def abortable_error(self, exc):
self.transaction_manager.transition_to_abortable_error(exc)
self._result.done(error=exc)
def fail(self, exc):
self._result.done(error=exc)
def reenqueue(self):
with self.transaction_manager._lock:
self._is_retry = True
self.transaction_manager._enqueue_request(self)
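
    # Network-thread callback. The manager permits only one in-flight
    # transactional request at a time, so a correlation-id mismatch means that
    # invariant was broken and is treated as fatal.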
def on_complete(self, correlation_id, response_or_exc):
if correlation_id != self.transaction_manager._in_flight_request_correlation_id:
self.fatal_error(RuntimeError("Detected more than one in-flight transactional request."))
else:
self.transaction_manager.clear_in_flight_transactional_request_correlation_id()
if isinstance(response_or_exc, Errors.KafkaConnectionError):
log.debug("Disconnected from node. Will retry.")
if self.needs_coordinator():
self.transaction_manager._lookup_coordinator(self.coordinator_type, self.coordinator_key)
self.reenqueue()
elif isinstance(response_or_exc, Errors.UnsupportedVersionError):
self.fatal_error(response_or_exc)
elif not isinstance(response_or_exc, (Exception, type(None))):
log.debug("Received transactional response %s for request %s", response_or_exc, self.request)
with self.transaction_manager._lock:
self.handle_response(response_or_exc)
else:
self.fatal_error(Errors.KafkaError("Could not execute transactional request for unknown reasons: %s" % response_or_exc))
def needs_coordinator(self):
return self.coordinator_type is not None
@property
def result(self):
return self._result
@property
def coordinator_type(self):
return 'transaction'
@property
def coordinator_key(self):
return self.transaction_manager.transactional_id
def set_retry(self):
self._is_retry = True
@property
def is_retry(self):
return self._is_retry
@abc.abstractmethod
def handle_response(self, response):
pass
@abc.abstractproperty
def priority(self):
pass


class InitProducerIdHandler(TxnRequestHandler):
def __init__(self, transaction_manager, transaction_timeout_ms):
super(InitProducerIdHandler, self).__init__(transaction_manager)
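        # Pick the request version from the negotiated broker API version:
        # v1 requires brokers >= 2.0, otherwise fall back to v0.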
if transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
self.request = InitProducerIdRequest[version](
transactional_id=self.transactional_id,
transaction_timeout_ms=transaction_timeout_ms)
@property
def priority(self):
return Priority.INIT_PRODUCER_ID
def handle_response(self, response):
error = Errors.for_code(response.error_code)
if error is Errors.NoError:
self.transaction_manager.set_producer_id_and_epoch(ProducerIdAndEpoch(response.producer_id, response.producer_epoch))
self.transaction_manager._transition_to(TransactionState.READY)
self._result.done()
elif error in (Errors.NotCoordinatorError, Errors.CoordinatorNotAvailableError):
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
self.reenqueue()
elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError):
self.reenqueue()
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
else:
self.fatal_error(Errors.KafkaError("Unexpected error in InitProducerIdResponse: %s" % (error())))


class AddPartitionsToTxnHandler(TxnRequestHandler):
def __init__(self, transaction_manager, topic_partitions):
super(AddPartitionsToTxnHandler, self).__init__(transaction_manager)
if transaction_manager._api_version >= (2, 7):
version = 2
elif transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
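        # The wire format wants partitions grouped by topic, so fold the flat
        # set of TopicPartitions into {topic: [partition, ...]}.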
topic_data = collections.defaultdict(list)
for tp in topic_partitions:
topic_data[tp.topic].append(tp.partition)
self.request = AddPartitionsToTxnRequest[version](
transactional_id=self.transactional_id,
producer_id=self.producer_id,
producer_epoch=self.producer_epoch,
topics=list(topic_data.items()))
@property
def priority(self):
return Priority.ADD_PARTITIONS_OR_OFFSETS
def handle_response(self, response):
has_partition_errors = False
unauthorized_topics = set()
self.retry_backoff_ms = self.transaction_manager.retry_backoff_ms
results = {TopicPartition(topic, partition): Errors.for_code(error_code)
for topic, partition_data in response.results
for partition, error_code in partition_data}
for tp, error in six.iteritems(results):
if error is Errors.NoError:
continue
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError):
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
self.reenqueue()
return
elif error is Errors.ConcurrentTransactionsError:
self.maybe_override_retry_backoff_ms()
self.reenqueue()
return
elif error in (Errors.CoordinatorLoadInProgressError, Errors.UnknownTopicOrPartitionError):
self.reenqueue()
return
elif error is Errors.InvalidProducerEpochError:
self.fatal_error(error())
return
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
return
elif error in (Errors.InvalidProducerIdMappingError, Errors.InvalidTxnStateError):
self.fatal_error(Errors.KafkaError(error()))
return
elif error is Errors.TopicAuthorizationFailedError:
unauthorized_topics.add(tp.topic)
elif error is Errors.OperationNotAttemptedError:
log.debug("Did not attempt to add partition %s to transaction because other partitions in the"
" batch had errors.", tp)
has_partition_errors = True
else:
log.error("Could not add partition %s due to unexpected error %s", tp, error())
has_partition_errors = True
partitions = set(results)
# Remove the partitions from the pending set regardless of the result. We use the presence
# of partitions in the pending set to know when it is not safe to send batches. However, if
# the partitions failed to be added and we enter an error state, we expect the batches to be
# aborted anyway. In this case, we must be able to continue sending the batches which are in
# retry for partitions that were successfully added.
self.transaction_manager._pending_partitions_in_transaction -= partitions
if unauthorized_topics:
self.abortable_error(Errors.TopicAuthorizationFailedError(unauthorized_topics))
elif has_partition_errors:
self.abortable_error(Errors.KafkaError("Could not add partitions to transaction due to errors: %s" % (results)))
else:
log.debug("Successfully added partitions %s to transaction", partitions)
self.transaction_manager._partitions_in_transaction.update(partitions)
self.transaction_manager._transaction_started = True
self._result.done()
def maybe_override_retry_backoff_ms(self):
# We only want to reduce the backoff when retrying the first AddPartition which errored out due to a
# CONCURRENT_TRANSACTIONS error since this means that the previous transaction is still completing and
# we don't want to wait too long before trying to start the new one.
#
# This is only a temporary fix, the long term solution is being tracked in
# https://issues.apache.org/jira/browse/KAFKA-5482
if not self.transaction_manager._partitions_in_transaction:
self.retry_backoff_ms = min(self.transaction_manager.ADD_PARTITIONS_RETRY_BACKOFF_MS, self.retry_backoff_ms)


class FindCoordinatorHandler(TxnRequestHandler):
def __init__(self, transaction_manager, coord_type, coord_key):
super(FindCoordinatorHandler, self).__init__(transaction_manager)
self._coord_type = coord_type
self._coord_key = coord_key
if transaction_manager._api_version >= (2, 0):
version = 2
else:
version = 1
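        # FindCoordinator encodes the coordinator type as an int8:
        # 0 = group coordinator, 1 = transaction coordinator.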
if coord_type == 'group':
coord_type_int8 = 0
elif coord_type == 'transaction':
coord_type_int8 = 1
else:
raise ValueError("Unrecognized coordinator type: %s" % (coord_type,))
self.request = FindCoordinatorRequest[version](
coordinator_key=coord_key,
coordinator_type=coord_type_int8,
)
@property
def priority(self):
return Priority.FIND_COORDINATOR
@property
def coordinator_type(self):
return None
@property
def coordinator_key(self):
return None
def handle_response(self, response):
error = Errors.for_code(response.error_code)
if error is Errors.NoError:
coordinator_id = self.transaction_manager._metadata.add_coordinator(
response, self._coord_type, self._coord_key)
if self._coord_type == 'group':
self.transaction_manager._consumer_group_coordinator = coordinator_id
elif self._coord_type == 'transaction':
self.transaction_manager._transaction_coordinator = coordinator_id
self._result.done()
elif error is Errors.CoordinatorNotAvailableError:
self.reenqueue()
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
elif error is Errors.GroupAuthorizationFailedError:
self.abortable_error(error(self._coord_key))
else:
self.fatal_error(Errors.KafkaError(
"Could not find a coordinator with type %s with key %s due to"
" unexpected error: %s" % (self._coord_type, self._coord_key, error())))


class EndTxnHandler(TxnRequestHandler):
def __init__(self, transaction_manager, committed):
super(EndTxnHandler, self).__init__(transaction_manager)
if self.transaction_manager._api_version >= (2, 7):
version = 2
elif self.transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
self.request = EndTxnRequest[version](
transactional_id=self.transactional_id,
producer_id=self.producer_id,
producer_epoch=self.producer_epoch,
committed=committed)
@property
def priority(self):
return Priority.END_TXN
def handle_response(self, response):
error = Errors.for_code(response.error_code)
if error is Errors.NoError:
self.transaction_manager._complete_transaction()
self._result.done()
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError):
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
self.reenqueue()
elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError):
self.reenqueue()
elif error is Errors.InvalidProducerEpochError:
self.fatal_error(error())
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
elif error is Errors.InvalidTxnStateError:
self.fatal_error(error())
else:
self.fatal_error(Errors.KafkaError("Unhandled error in EndTxnResponse: %s" % (error())))


class AddOffsetsToTxnHandler(TxnRequestHandler):
def __init__(self, transaction_manager, consumer_group_id, offsets):
super(AddOffsetsToTxnHandler, self).__init__(transaction_manager)
self.consumer_group_id = consumer_group_id
self.offsets = offsets
if self.transaction_manager._api_version >= (2, 7):
version = 2
elif self.transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
self.request = AddOffsetsToTxnRequest[version](
transactional_id=self.transactional_id,
producer_id=self.producer_id,
producer_epoch=self.producer_epoch,
group_id=consumer_group_id)
@property
def priority(self):
return Priority.ADD_PARTITIONS_OR_OFFSETS
def handle_response(self, response):
error = Errors.for_code(response.error_code)
if error is Errors.NoError:
log.debug("Successfully added partition for consumer group %s to transaction", self.consumer_group_id)
# note the result is not completed until the TxnOffsetCommit returns
for tp, offset in six.iteritems(self.offsets):
self.transaction_manager._pending_txn_offset_commits[tp] = offset
handler = TxnOffsetCommitHandler(self.transaction_manager, self.consumer_group_id,
self.transaction_manager._pending_txn_offset_commits, self._result)
self.transaction_manager._enqueue_request(handler)
self.transaction_manager._transaction_started = True
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError):
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
self.reenqueue()
elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError):
self.reenqueue()
elif error is Errors.InvalidProducerEpochError:
self.fatal_error(error())
elif error is Errors.TransactionalIdAuthorizationFailedError:
self.fatal_error(error())
elif error is Errors.GroupAuthorizationFailedError:
self.abortable_error(error(self.consumer_group_id))
else:
self.fatal_error(Errors.KafkaError("Unexpected error in AddOffsetsToTxnResponse: %s" % (error())))


class TxnOffsetCommitHandler(TxnRequestHandler):
def __init__(self, transaction_manager, consumer_group_id, offsets, result):
super(TxnOffsetCommitHandler, self).__init__(transaction_manager, result=result)
self.consumer_group_id = consumer_group_id
self.offsets = offsets
self.request = self._build_request()
def _build_request(self):
if self.transaction_manager._api_version >= (2, 1):
version = 2
elif self.transaction_manager._api_version >= (2, 0):
version = 1
else:
version = 0
topic_data = collections.defaultdict(list)
for tp, offset in six.iteritems(self.offsets):
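            # v2 adds the leader epoch to each offset; older versions omit it.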
if version >= 2:
partition_data = (tp.partition, offset.offset, offset.leader_epoch, offset.metadata)
else:
partition_data = (tp.partition, offset.offset, offset.metadata)
topic_data[tp.topic].append(partition_data)
return TxnOffsetCommitRequest[version](
transactional_id=self.transactional_id,
group_id=self.consumer_group_id,
producer_id=self.producer_id,
producer_epoch=self.producer_epoch,
topics=list(topic_data.items()))
@property
def priority(self):
return Priority.ADD_PARTITIONS_OR_OFFSETS
@property
def coordinator_type(self):
return 'group'
@property
def coordinator_key(self):
return self.consumer_group_id
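
    # Offsets commit per-partition: successful partitions leave the pending
    # set, retriable failures are rebuilt into a fresh request and re-queued,
    # and anything fatal or abortable fails the shared result immediately.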
def handle_response(self, response):
lookup_coordinator = False
retriable_failure = False
errors = {TopicPartition(topic, partition): Errors.for_code(error_code)
for topic, partition_data in response.topics
for partition, error_code in partition_data}
for tp, error in six.iteritems(errors):
if error is Errors.NoError:
log.debug("Successfully added offsets for %s from consumer group %s to transaction.",
tp, self.consumer_group_id)
del self.transaction_manager._pending_txn_offset_commits[tp]
            elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError, Errors.RequestTimedOutError):
retriable_failure = True
lookup_coordinator = True
elif error is Errors.UnknownTopicOrPartitionError:
retriable_failure = True
elif error is Errors.GroupAuthorizationFailedError:
self.abortable_error(error(self.consumer_group_id))
return
elif error in (Errors.TransactionalIdAuthorizationFailedError,
Errors.InvalidProducerEpochError,
Errors.UnsupportedForMessageFormatError):
self.fatal_error(error())
return
else:
self.fatal_error(Errors.KafkaError("Unexpected error in TxnOffsetCommitResponse: %s" % (error())))
return
if lookup_coordinator:
self.transaction_manager._lookup_coordinator('group', self.consumer_group_id)
if not retriable_failure:
# all attempted partitions were either successful, or there was a fatal failure.
# either way, we are not retrying, so complete the request.
self.result.done()
# retry the commits which failed with a retriable error.
elif self.transaction_manager._pending_txn_offset_commits:
self.offsets = self.transaction_manager._pending_txn_offset_commits
self.request = self._build_request()
self.reenqueue()