115
venv/lib/python3.12/site-packages/kafka/producer/buffer.py
Normal file
@@ -0,0 +1,115 @@
from __future__ import absolute_import, division

import collections
import io
import threading
import time

from kafka.metrics.stats import Rate

import kafka.errors as Errors


class SimpleBufferPool(object):
    """A simple pool of BytesIO objects with a weak memory ceiling."""
    def __init__(self, memory, poolable_size, metrics=None, metric_group_prefix='producer-metrics'):
        """Create a new buffer pool.

        Arguments:
            memory (int): maximum memory that this buffer pool can allocate
            poolable_size (int): memory size per buffer to cache in the free
                list rather than deallocating
        """
        self._poolable_size = poolable_size
        self._lock = threading.RLock()

        buffers = int(memory / poolable_size) if poolable_size else 0
        self._free = collections.deque([io.BytesIO() for _ in range(buffers)])

        self._waiters = collections.deque()
        self.wait_time = None
        if metrics:
            self.wait_time = metrics.sensor('bufferpool-wait-time')
            self.wait_time.add(metrics.metric_name(
                'bufferpool-wait-ratio', metric_group_prefix,
                'The fraction of time an appender waits for space allocation.'),
                Rate())

    def allocate(self, size, max_time_to_block_ms):
        """
        Allocate a buffer of the given size. This method blocks if there is not
        enough memory and the buffer pool is configured with blocking mode.

        Arguments:
            size (int): The buffer size to allocate in bytes [ignored]
            max_time_to_block_ms (int): The maximum time in milliseconds to
                block for buffer memory to be available

        Returns:
            io.BytesIO
        """
        with self._lock:
            # check if we have a free buffer of the right size pooled
            if self._free:
                return self._free.popleft()

            elif self._poolable_size == 0:
                return io.BytesIO()

            else:
                # we are out of buffers and will have to block
                buf = None
                more_memory = threading.Condition(self._lock)
                self._waiters.append(more_memory)
                # loop over and over until we have a buffer or have reserved
                # enough memory to allocate one
                while buf is None:
                    start_wait = time.time()
                    more_memory.wait(max_time_to_block_ms / 1000.0)
                    end_wait = time.time()
                    if self.wait_time:
                        self.wait_time.record(end_wait - start_wait)

                    if self._free:
                        buf = self._free.popleft()
                    else:
                        self._waiters.remove(more_memory)
                        raise Errors.KafkaTimeoutError(
                            "Failed to allocate memory within the configured"
                            " max blocking time")

                # remove the condition for this thread to let the next thread
                # in line start getting memory
                removed = self._waiters.popleft()
                assert removed is more_memory, 'Wrong condition'

                # signal any additional waiters if there is more memory left
                # over for them
                if self._free and self._waiters:
                    self._waiters[0].notify()

                # unlock and return the buffer
                return buf

    def deallocate(self, buf):
        """
        Return buffers to the pool. If they are of the poolable size add them
        to the free list, otherwise just mark the memory as free.

        Arguments:
            buffer_ (io.BytesIO): The buffer to return
        """
        with self._lock:
            # BytesIO.truncate here makes the pool somewhat pointless
            # but we stick with the BufferPool API until migrating to
            # bytesarray / memoryview. The buffer we return must not
            # expose any prior data on read().
            buf.truncate(0)
            self._free.append(buf)
            if self._waiters:
                self._waiters[0].notify()

    def queued(self):
        """The number of threads blocked waiting on memory."""
        with self._lock:
            return len(self._waiters)
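The new SimpleBufferPool above hands out pooled io.BytesIO objects and blocks callers once the memory ceiling is reached. A minimal usage sketch, not part of this commit: the sizes and the single-threaded flow are made up for illustration, and it assumes a kafka-python install that ships kafka.producer.buffer.

import io

from kafka.producer.buffer import SimpleBufferPool

# 32 KiB ceiling with 16 KiB pooled buffers -> two pre-allocated BytesIO objects
pool = SimpleBufferPool(memory=32768, poolable_size=16384)

buf = pool.allocate(size=16384, max_time_to_block_ms=1000)
assert isinstance(buf, io.BytesIO)
buf.write(b'record bytes go here')

# deallocate() truncates the buffer and notifies one blocked waiter, if any
pool.deallocate(buf)
print(pool.queued())  # 0 -- no threads are currently blocked waiting for memory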
@@ -38,7 +38,7 @@ class FutureRecordMetadata(Future):
        produce_future.add_errback(self.failure)

    def _produce_success(self, offset_and_timestamp):
        offset, produce_timestamp_ms = offset_and_timestamp
        offset, produce_timestamp_ms, log_start_offset = offset_and_timestamp

        # Unpacking from args tuple is minor speed optimization
        (relative_offset, timestamp_ms, checksum,
@@ -51,7 +51,7 @@ class FutureRecordMetadata(Future):
        if offset != -1 and relative_offset is not None:
            offset += relative_offset
        tp = self._produce_future.topic_partition
        metadata = RecordMetadata(tp[0], tp[1], tp, offset, timestamp_ms,
        metadata = RecordMetadata(tp[0], tp[1], tp, offset, timestamp_ms, log_start_offset,
                                  checksum, serialized_key_size,
                                  serialized_value_size, serialized_header_size)
        self.success(metadata)
@@ -67,5 +67,5 @@ class FutureRecordMetadata(Future):


RecordMetadata = collections.namedtuple(
    'RecordMetadata', ['topic', 'partition', 'topic_partition', 'offset', 'timestamp',
    'RecordMetadata', ['topic', 'partition', 'topic_partition', 'offset', 'timestamp', 'log_start_offset',
                       'checksum', 'serialized_key_size', 'serialized_value_size', 'serialized_header_size'])
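The hunks above change the produce result tuple and the RecordMetadata namedtuple by a log_start_offset field. A hedged sketch of how calling code usually observes that metadata via the future returned by send(); the broker address and topic name are placeholders, not part of this commit.

from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')
future = producer.send('my-topic', b'payload')   # returns a FutureRecordMetadata
metadata = future.get(timeout=10)                # blocks until the broker acks

print(metadata.topic, metadata.partition, metadata.offset, metadata.timestamp)
# Whether metadata also carries log_start_offset depends on which side of this
# diff the installed kafka-python falls on.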
@@ -1,11 +1,11 @@
from __future__ import absolute_import, division
from __future__ import absolute_import

import atexit
import copy
import logging
import socket
import threading
import warnings
import time
import weakref

from kafka.vendor import six
@@ -18,12 +18,10 @@ from kafka.partitioner.default import DefaultPartitioner
from kafka.producer.future import FutureRecordMetadata, FutureProduceResult
from kafka.producer.record_accumulator import AtomicInteger, RecordAccumulator
from kafka.producer.sender import Sender
from kafka.producer.transaction_manager import TransactionManager
from kafka.record.default_records import DefaultRecordBatchBuilder
from kafka.record.legacy_records import LegacyRecordBatchBuilder
from kafka.serializer import Serializer
from kafka.structs import TopicPartition
from kafka.util import Timer, ensure_valid_topic_name


log = logging.getLogger(__name__)
@@ -36,8 +34,8 @@ class KafkaProducer(object):
|
||||
The producer is thread safe and sharing a single producer instance across
|
||||
threads will generally be faster than having multiple instances.
|
||||
|
||||
The producer consists of a RecordAccumulator which holds records that
|
||||
haven't yet been transmitted to the server, and a Sender background I/O
|
||||
The producer consists of a pool of buffer space that holds records that
|
||||
haven't yet been transmitted to the server as well as a background I/O
|
||||
thread that is responsible for turning these records into requests and
|
||||
transmitting them to the cluster.
|
||||
|
||||
@@ -73,50 +71,14 @@ class KafkaProducer(object):
|
||||
can lead to fewer, more efficient requests when not under maximal load at
|
||||
the cost of a small amount of latency.
|
||||
|
||||
The buffer_memory controls the total amount of memory available to the
|
||||
producer for buffering. If records are sent faster than they can be
|
||||
transmitted to the server then this buffer space will be exhausted. When
|
||||
the buffer space is exhausted additional send calls will block.
|
||||
|
||||
The key_serializer and value_serializer instruct how to turn the key and
|
||||
value objects the user provides into bytes.
|
||||
|
||||
From Kafka 0.11, the KafkaProducer supports two additional modes:
|
||||
the idempotent producer and the transactional producer.
|
||||
The idempotent producer strengthens Kafka's delivery semantics from
|
||||
at least once to exactly once delivery. In particular, producer retries
|
||||
will no longer introduce duplicates. The transactional producer allows an
|
||||
application to send messages to multiple partitions (and topics!)
|
||||
atomically.
|
||||
|
||||
To enable idempotence, the `enable_idempotence` configuration must be set
|
||||
to True. If set, the `retries` config will default to `float('inf')` and
|
||||
the `acks` config will default to 'all'. There are no API changes for the
|
||||
idempotent producer, so existing applications will not need to be modified
|
||||
to take advantage of this feature.
|
||||
|
||||
To take advantage of the idempotent producer, it is imperative to avoid
|
||||
application level re-sends since these cannot be de-duplicated. As such, if
|
||||
an application enables idempotence, it is recommended to leave the
|
||||
`retries` config unset, as it will be defaulted to `float('inf')`.
|
||||
Additionally, if a :meth:`~kafka.KafkaProducer.send` returns an error even
|
||||
with infinite retries (for instance if the message expires in the buffer
|
||||
before being sent), then it is recommended to shut down the producer and
|
||||
check the contents of the last produced message to ensure that it is not
|
||||
duplicated. Finally, the producer can only guarantee idempotence for
|
||||
messages sent within a single session.
|
||||
|
||||
To use the transactional producer and the attendant APIs, you must set the
|
||||
`transactional_id` configuration property. If the `transactional_id` is
|
||||
set, idempotence is automatically enabled along with the producer configs
|
||||
which idempotence depends on. Further, topics which are included in
|
||||
transactions should be configured for durability. In particular, the
|
||||
`replication.factor` should be at least `3`, and the `min.insync.replicas`
|
||||
for these topics should be set to 2. Finally, in order for transactional
|
||||
guarantees to be realized from end-to-end, the consumers must be
|
||||
configured to read only committed messages as well.
|
||||
|
||||
The purpose of the `transactional_id` is to enable transaction recovery
|
||||
across multiple sessions of a single producer instance. It would typically
|
||||
be derived from the shard identifier in a partitioned, stateful,
|
||||
application. As such, it should be unique to each producer instance running
|
||||
within a partitioned application.
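The idempotent and transactional modes described above are driven purely by configuration. A minimal sketch, not part of this commit: the broker address and transactional id are placeholders, and it assumes a kafka-python build that exposes these settings (the side of this diff whose docstring appears above).

from kafka import KafkaProducer

# Idempotent producer: retries and acks are overridden to safe defaults if unset.
idempotent_producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    enable_idempotence=True,
)

# Transactional producer: setting transactional_id implies enable_idempotence=True.
transactional_producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    transactional_id='order-processor-shard-0',
)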
|
||||
|
||||
Keyword Arguments:
|
||||
bootstrap_servers: 'host[:port]' string (or list of 'host[:port]'
|
||||
strings) that the producer should contact to bootstrap initial
|
||||
@@ -134,28 +96,6 @@ class KafkaProducer(object):
|
||||
value_serializer (callable): used to convert user-supplied message
|
||||
values to bytes. If not None, called as f(value), should return
|
||||
bytes. Default: None.
|
||||
enable_idempotence (bool): When set to True, the producer will ensure
|
||||
that exactly one copy of each message is written in the stream.
|
||||
If False, producer retries due to broker failures, etc., may write
|
||||
duplicates of the retried message in the stream. Default: False.
|
||||
|
||||
Note that enabling idempotence requires
|
||||
`max_in_flight_requests_per_connection` to be set to 1 and `retries`
|
||||
cannot be zero. Additionally, `acks` must be set to 'all'. If these
|
||||
values are left at their defaults, the producer will override the
|
||||
defaults to be suitable. If the values are set to something
|
||||
incompatible with the idempotent producer, a KafkaConfigurationError
|
||||
will be raised.
|
||||
delivery_timeout_ms (float): An upper bound on the time to report success
|
||||
or failure after producer.send() returns. This limits the total time
|
||||
that a record will be delayed prior to sending, the time to await
|
||||
acknowledgement from the broker (if expected), and the time allowed
|
||||
for retriable send failures. The producer may report failure to send
|
||||
a record earlier than this config if either an unrecoverable error is
|
||||
encountered, the retries have been exhausted, or the record is added
|
||||
to a batch which reached an earlier delivery expiration deadline.
|
||||
The value of this config should be greater than or equal to the
|
||||
sum of (request_timeout_ms + linger_ms). Default: 120000.
|
||||
acks (0, 1, 'all'): The number of acknowledgments the producer requires
|
||||
the leader to have received before considering a request complete.
|
||||
This controls the durability of records that are sent. The
|
||||
@@ -183,7 +123,7 @@ class KafkaProducer(object):
|
||||
Compression is of full batches of data, so the efficacy of batching
|
||||
will also impact the compression ratio (more batching means better
|
||||
compression). Default: None.
|
||||
retries (numeric): Setting a value greater than zero will cause the client
|
||||
retries (int): Setting a value greater than zero will cause the client
|
||||
to resend any record whose send fails with a potentially transient
|
||||
error. Note that this retry is no different than if the client
|
||||
resent the record upon receiving the error. Allowing retries
|
||||
@@ -191,12 +131,8 @@ class KafkaProducer(object):
|
||||
potentially change the ordering of records because if two batches
|
||||
are sent to a single partition, and the first fails and is retried
|
||||
but the second succeeds, then the records in the second batch may
|
||||
appear first. Note additionally that produce requests will be
|
||||
failed before the number of retries has been exhausted if the timeout
|
||||
configured by delivery_timeout_ms expires first before successful
|
||||
acknowledgement. Users should generally prefer to leave this config
|
||||
unset and instead use delivery_timeout_ms to control retry behavior.
|
||||
Default: float('inf') (infinite)
|
||||
appear first.
|
||||
Default: 0.
|
||||
batch_size (int): Requests sent to brokers will contain multiple
|
||||
batches, one for each partition with data available to be sent.
|
||||
A small batch size will make batching less common and may reduce
|
||||
@@ -229,6 +165,12 @@ class KafkaProducer(object):
|
||||
messages with the same key are assigned to the same partition.
|
||||
When a key is None, the message is delivered to a random partition
|
||||
(filtered to partitions with available leaders only, if possible).
|
||||
buffer_memory (int): The total bytes of memory the producer should use
|
||||
to buffer records waiting to be sent to the server. If records are
|
||||
sent faster than they can be delivered to the server the producer
|
||||
will block up to max_block_ms, raising an exception on timeout.
|
||||
In the current implementation, this setting is an approximation.
|
||||
Default: 33554432 (32MB)
|
||||
connections_max_idle_ms: Close idle connections after the number of
|
||||
milliseconds specified by this config. The broker closes idle
|
||||
connections after connections.max.idle.ms, so this avoids hitting
|
||||
@@ -246,9 +188,6 @@ class KafkaProducer(object):
|
||||
This setting will limit the number of record batches the producer
|
||||
will send in a single request to avoid sending huge requests.
|
||||
Default: 1048576.
|
||||
allow_auto_create_topics (bool): Enable/disable auto topic creation
|
||||
on metadata request. Only available with api_version >= (0, 11).
|
||||
Default: True
|
||||
metadata_max_age_ms (int): The period of time in milliseconds after
|
||||
which we force a refresh of metadata even if we haven't seen any
|
||||
partition leadership changes to proactively discover any new
|
||||
@@ -277,7 +216,7 @@ class KafkaProducer(object):
|
||||
reconnection attempts will continue periodically with this fixed
|
||||
rate. To avoid connection storms, a randomization factor of 0.2
|
||||
will be applied to the backoff resulting in a random range between
|
||||
20% below and 20% above the computed value. Default: 30000.
|
||||
20% below and 20% above the computed value. Default: 1000.
|
||||
max_in_flight_requests_per_connection (int): Requests are pipelined
|
||||
to kafka brokers up to this number of maximum requests per
|
||||
broker connection. Note that if this setting is set to be greater
|
||||
@@ -294,7 +233,7 @@ class KafkaProducer(object):
|
||||
should verify that the certificate matches the brokers hostname.
|
||||
default: true.
|
||||
ssl_cafile (str): optional filename of ca file to use in certificate
|
||||
verification. default: none.
|
||||
veriication. default: none.
|
||||
ssl_certfile (str): optional filename of file in pem format containing
|
||||
the client certificate, as well as any ca certificates needed to
|
||||
establish the certificate's authenticity. default: none.
|
||||
@@ -313,28 +252,14 @@ class KafkaProducer(object):
|
||||
or other configuration forbids use of all the specified ciphers),
|
||||
an ssl.SSLError will be raised. See ssl.SSLContext.set_ciphers
|
||||
api_version (tuple): Specify which Kafka API version to use. If set to
|
||||
None, the client will attempt to determine the broker version via
|
||||
ApiVersionsRequest API or, for brokers earlier than 0.10, probing
|
||||
various known APIs. Dynamic version checking is performed eagerly
|
||||
during __init__ and can raise NoBrokersAvailableError if no connection
|
||||
was made before timeout (see api_version_auto_timeout_ms below).
|
||||
Different versions enable different functionality.
|
||||
|
||||
Examples:
|
||||
(3, 9) most recent broker release, enable all supported features
|
||||
(0, 11) enables message format v2 (internal)
|
||||
(0, 10, 0) enables sasl authentication and message format v1
|
||||
(0, 8, 0) enables basic functionality only
|
||||
|
||||
Default: None
|
||||
None, the client will attempt to infer the broker version by probing
|
||||
various APIs. Example: (0, 10, 2). Default: None
|
||||
api_version_auto_timeout_ms (int): number of milliseconds to throw a
|
||||
timeout exception from the constructor when checking the broker
|
||||
api version. Only applies if api_version set to None.
|
||||
Default: 2000
|
||||
metric_reporters (list): A list of classes to use as metrics reporters.
|
||||
Implementing the AbstractMetricsReporter interface allows plugging
|
||||
in classes that will be notified of new metric creation. Default: []
|
||||
metrics_enabled (bool): Whether to track metrics on this instance. Default True.
|
||||
metrics_num_samples (int): The number of samples maintained to compute
|
||||
metrics. Default: 2
|
||||
metrics_sample_window_ms (int): The maximum age in milliseconds of
|
||||
@@ -349,42 +274,33 @@ class KafkaProducer(object):
|
||||
Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
|
||||
sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication.
|
||||
Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
|
||||
sasl_kerberos_name (str or gssapi.Name): Constructed gssapi.Name for use with
|
||||
sasl mechanism handshake. If provided, sasl_kerberos_service_name and
|
||||
sasl_kerberos_domain name are ignored. Default: None.
|
||||
sasl_kerberos_service_name (str): Service name to include in GSSAPI
|
||||
sasl mechanism handshake. Default: 'kafka'
|
||||
sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI
|
||||
sasl mechanism handshake. Default: one of bootstrap servers
|
||||
sasl_oauth_token_provider (kafka.sasl.oauth.AbstractTokenProvider): OAuthBearer
|
||||
token provider instance. Default: None
|
||||
socks5_proxy (str): Socks5 proxy URL. Default: None
|
||||
kafka_client (callable): Custom class / callable for creating KafkaClient instances
|
||||
sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider
|
||||
instance. (See kafka.oauth.abstract). Default: None
|
||||
|
||||
Note:
|
||||
Configuration parameters are described in more detail at
|
||||
https://kafka.apache.org/0100/documentation/#producerconfigs
|
||||
https://kafka.apache.org/0100/configuration.html#producerconfigs
|
||||
"""
|
||||
DEFAULT_CONFIG = {
|
||||
'bootstrap_servers': 'localhost',
|
||||
'client_id': None,
|
||||
'key_serializer': None,
|
||||
'value_serializer': None,
|
||||
'enable_idempotence': False,
|
||||
'transactional_id': None,
|
||||
'transaction_timeout_ms': 60000,
|
||||
'delivery_timeout_ms': 120000,
|
||||
'acks': 1,
|
||||
'bootstrap_topics_filter': set(),
|
||||
'compression_type': None,
|
||||
'retries': float('inf'),
|
||||
'retries': 0,
|
||||
'batch_size': 16384,
|
||||
'linger_ms': 0,
|
||||
'partitioner': DefaultPartitioner(),
|
||||
'buffer_memory': 33554432,
|
||||
'connections_max_idle_ms': 9 * 60 * 1000,
|
||||
'max_block_ms': 60000,
|
||||
'max_request_size': 1048576,
|
||||
'allow_auto_create_topics': True,
|
||||
'metadata_max_age_ms': 300000,
|
||||
'retry_backoff_ms': 100,
|
||||
'request_timeout_ms': 30000,
|
||||
@@ -394,7 +310,7 @@ class KafkaProducer(object):
|
||||
'sock_chunk_bytes': 4096, # undocumented experimental option
|
||||
'sock_chunk_buffer_count': 1000, # undocumented experimental option
|
||||
'reconnect_backoff_ms': 50,
|
||||
'reconnect_backoff_max_ms': 30000,
|
||||
'reconnect_backoff_max_ms': 1000,
|
||||
'max_in_flight_requests_per_connection': 5,
|
||||
'security_protocol': 'PLAINTEXT',
|
||||
'ssl_context': None,
|
||||
@@ -408,23 +324,17 @@ class KafkaProducer(object):
|
||||
'api_version': None,
|
||||
'api_version_auto_timeout_ms': 2000,
|
||||
'metric_reporters': [],
|
||||
'metrics_enabled': True,
|
||||
'metrics_num_samples': 2,
|
||||
'metrics_sample_window_ms': 30000,
|
||||
'selector': selectors.DefaultSelector,
|
||||
'sasl_mechanism': None,
|
||||
'sasl_plain_username': None,
|
||||
'sasl_plain_password': None,
|
||||
'sasl_kerberos_name': None,
|
||||
'sasl_kerberos_service_name': 'kafka',
|
||||
'sasl_kerberos_domain_name': None,
|
||||
'sasl_oauth_token_provider': None,
|
||||
'socks5_proxy': None,
|
||||
'kafka_client': KafkaClient,
|
||||
'sasl_oauth_token_provider': None
|
||||
}
|
||||
|
||||
DEPRECATED_CONFIGS = ('buffer_memory',)
|
||||
|
||||
_COMPRESSORS = {
|
||||
'gzip': (has_gzip, LegacyRecordBatchBuilder.CODEC_GZIP),
|
||||
'snappy': (has_snappy, LegacyRecordBatchBuilder.CODEC_SNAPPY),
|
||||
@@ -434,17 +344,12 @@ class KafkaProducer(object):
|
||||
}
|
||||
|
||||
def __init__(self, **configs):
|
||||
log.debug("Starting the Kafka producer") # trace
|
||||
self.config = copy.copy(self.DEFAULT_CONFIG)
|
||||
user_provided_configs = set(configs.keys())
|
||||
for key in self.config:
|
||||
if key in configs:
|
||||
self.config[key] = configs.pop(key)
|
||||
|
||||
for key in self.DEPRECATED_CONFIGS:
|
||||
if key in configs:
|
||||
configs.pop(key)
|
||||
warnings.warn('Deprecated Producer config: %s' % (key,), DeprecationWarning)
|
||||
|
||||
# Only check for extra config keys in top-level class
|
||||
assert not configs, 'Unrecognized configs: %s' % (configs,)
|
||||
|
||||
@@ -462,35 +367,30 @@ class KafkaProducer(object):
|
||||
self.config['api_version'] = None
|
||||
else:
|
||||
self.config['api_version'] = tuple(map(int, deprecated.split('.')))
|
||||
log.warning('%s: use api_version=%s [tuple] -- "%s" as str is deprecated',
|
||||
str(self), str(self.config['api_version']), deprecated)
|
||||
|
||||
log.debug("%s: Starting Kafka producer", str(self))
|
||||
log.warning('use api_version=%s [tuple] -- "%s" as str is deprecated',
|
||||
str(self.config['api_version']), deprecated)
|
||||
|
||||
# Configure metrics
|
||||
if self.config['metrics_enabled']:
|
||||
metrics_tags = {'client-id': self.config['client_id']}
|
||||
metric_config = MetricConfig(samples=self.config['metrics_num_samples'],
|
||||
time_window_ms=self.config['metrics_sample_window_ms'],
|
||||
tags=metrics_tags)
|
||||
reporters = [reporter() for reporter in self.config['metric_reporters']]
|
||||
self._metrics = Metrics(metric_config, reporters)
|
||||
else:
|
||||
self._metrics = None
|
||||
metrics_tags = {'client-id': self.config['client_id']}
|
||||
metric_config = MetricConfig(samples=self.config['metrics_num_samples'],
|
||||
time_window_ms=self.config['metrics_sample_window_ms'],
|
||||
tags=metrics_tags)
|
||||
reporters = [reporter() for reporter in self.config['metric_reporters']]
|
||||
self._metrics = Metrics(metric_config, reporters)
|
||||
|
||||
client = self.config['kafka_client'](
|
||||
metrics=self._metrics, metric_group_prefix='producer',
|
||||
wakeup_timeout_ms=self.config['max_block_ms'],
|
||||
**self.config)
|
||||
client = KafkaClient(metrics=self._metrics, metric_group_prefix='producer',
|
||||
wakeup_timeout_ms=self.config['max_block_ms'],
|
||||
**self.config)
|
||||
|
||||
# Get auto-discovered / normalized version from client
|
||||
self.config['api_version'] = client.config['api_version']
|
||||
# Get auto-discovered version from client if necessary
|
||||
if self.config['api_version'] is None:
|
||||
self.config['api_version'] = client.config['api_version']
|
||||
|
||||
if self.config['compression_type'] == 'lz4':
|
||||
assert self.config['api_version'] >= (0, 8, 2), 'LZ4 Requires >= Kafka 0.8.2 Brokers'
|
||||
|
||||
if self.config['compression_type'] == 'zstd':
|
||||
assert self.config['api_version'] >= (2, 1), 'Zstd Requires >= Kafka 2.1 Brokers'
|
||||
assert self.config['api_version'] >= (2, 1, 0), 'Zstd Requires >= Kafka 2.1.0 Brokers'
|
||||
|
||||
# Check compression_type for library support
|
||||
ct = self.config['compression_type']
|
||||
@@ -501,58 +401,12 @@ class KafkaProducer(object):
|
||||
assert checker(), "Libraries for {} compression codec not found".format(ct)
|
||||
self.config['compression_attrs'] = compression_attrs
|
||||
|
||||
message_version = self._max_usable_produce_magic()
|
||||
self._accumulator = RecordAccumulator(message_version=message_version, metrics=self._metrics, **self.config)
|
||||
self._metadata = client.cluster
|
||||
self._transaction_manager = None
|
||||
self._init_transactions_result = None
|
||||
if 'enable_idempotence' in user_provided_configs and not self.config['enable_idempotence'] and self.config['transactional_id']:
|
||||
raise Errors.KafkaConfigurationError("Cannot set transactional_id without enable_idempotence.")
|
||||
|
||||
if self.config['transactional_id']:
|
||||
self.config['enable_idempotence'] = True
|
||||
|
||||
if self.config['enable_idempotence']:
|
||||
assert self.config['api_version'] >= (0, 11), "Transactional/Idempotent producer requires >= Kafka 0.11 Brokers"
|
||||
|
||||
self._transaction_manager = TransactionManager(
|
||||
transactional_id=self.config['transactional_id'],
|
||||
transaction_timeout_ms=self.config['transaction_timeout_ms'],
|
||||
retry_backoff_ms=self.config['retry_backoff_ms'],
|
||||
api_version=self.config['api_version'],
|
||||
metadata=self._metadata,
|
||||
)
|
||||
if self._transaction_manager.is_transactional():
|
||||
log.info("%s: Instantiated a transactional producer.", str(self))
|
||||
else:
|
||||
log.info("%s: Instantiated an idempotent producer.", str(self))
|
||||
|
||||
if self.config['retries'] == 0:
|
||||
raise Errors.KafkaConfigurationError("Must set 'retries' to non-zero when using the idempotent producer.")
|
||||
|
||||
if 'max_in_flight_requests_per_connection' not in user_provided_configs:
|
||||
log.info("%s: Overriding the default 'max_in_flight_requests_per_connection' to 1 since idempontence is enabled.", str(self))
|
||||
self.config['max_in_flight_requests_per_connection'] = 1
|
||||
elif self.config['max_in_flight_requests_per_connection'] != 1:
|
||||
raise Errors.KafkaConfigurationError("Must set 'max_in_flight_requests_per_connection' to 1 in order"
|
||||
" to use the idempotent producer."
|
||||
" Otherwise we cannot guarantee idempotence.")
|
||||
|
||||
if 'acks' not in user_provided_configs:
|
||||
log.info("%s: Overriding the default 'acks' config to 'all' since idempotence is enabled", str(self))
|
||||
self.config['acks'] = -1
|
||||
elif self.config['acks'] != -1:
|
||||
raise Errors.KafkaConfigurationError("Must set 'acks' config to 'all' in order to use the idempotent"
|
||||
" producer. Otherwise we cannot guarantee idempotence")
|
||||
|
||||
message_version = self.max_usable_produce_magic(self.config['api_version'])
|
||||
self._accumulator = RecordAccumulator(
|
||||
transaction_manager=self._transaction_manager,
|
||||
message_version=message_version,
|
||||
**self.config)
|
||||
guarantee_message_order = bool(self.config['max_in_flight_requests_per_connection'] == 1)
|
||||
self._sender = Sender(client, self._metadata,
|
||||
self._accumulator,
|
||||
metrics=self._metrics,
|
||||
transaction_manager=self._transaction_manager,
|
||||
self._accumulator, self._metrics,
|
||||
guarantee_message_order=guarantee_message_order,
|
||||
**self.config)
|
||||
self._sender.daemon = True
|
||||
@@ -561,7 +415,7 @@ class KafkaProducer(object):
|
||||
|
||||
self._cleanup = self._cleanup_factory()
|
||||
atexit.register(self._cleanup)
|
||||
log.debug("%s: Kafka producer started", str(self))
|
||||
log.debug("Kafka producer started")
|
||||
|
||||
def bootstrap_connected(self):
|
||||
"""Return True if the bootstrap is connected."""
|
||||
@@ -572,7 +426,7 @@ class KafkaProducer(object):
|
||||
_self = weakref.proxy(self)
|
||||
def wrapper():
|
||||
try:
|
||||
_self.close(timeout=0, null_logger=True)
|
||||
_self.close(timeout=0)
|
||||
except (ReferenceError, AttributeError):
|
||||
pass
|
||||
return wrapper
|
||||
@@ -595,28 +449,28 @@ class KafkaProducer(object):
|
||||
self._cleanup = None
|
||||
|
||||
def __del__(self):
|
||||
self.close(timeout=1, null_logger=True)
|
||||
# Disable logger during destruction to avoid touching dangling references
|
||||
class NullLogger(object):
|
||||
def __getattr__(self, name):
|
||||
return lambda *args: None
|
||||
|
||||
def close(self, timeout=None, null_logger=False):
|
||||
global log
|
||||
log = NullLogger()
|
||||
|
||||
self.close()
|
||||
|
||||
def close(self, timeout=None):
|
||||
"""Close this producer.
|
||||
|
||||
Arguments:
|
||||
timeout (float, optional): timeout in seconds to wait for completion.
|
||||
"""
|
||||
if null_logger:
|
||||
# Disable logger during destruction to avoid touching dangling references
|
||||
class NullLogger(object):
|
||||
def __getattr__(self, name):
|
||||
return lambda *args: None
|
||||
|
||||
global log
|
||||
log = NullLogger()
|
||||
|
||||
# drop our atexit handler now to avoid leaks
|
||||
self._unregister_cleanup()
|
||||
|
||||
if not hasattr(self, '_closed') or self._closed:
|
||||
log.info('%s: Kafka producer closed', str(self))
|
||||
log.info('Kafka producer closed')
|
||||
return
|
||||
if timeout is None:
|
||||
# threading.TIMEOUT_MAX is available in Python3.3+
|
||||
@@ -626,16 +480,15 @@ class KafkaProducer(object):
|
||||
else:
|
||||
assert timeout >= 0
|
||||
|
||||
log.info("%s: Closing the Kafka producer with %s secs timeout.", str(self), timeout)
|
||||
self.flush(timeout)
|
||||
log.info("Closing the Kafka producer with %s secs timeout.", timeout)
|
||||
invoked_from_callback = bool(threading.current_thread() is self._sender)
|
||||
if timeout > 0:
|
||||
if invoked_from_callback:
|
||||
log.warning("%s: Overriding close timeout %s secs to 0 in order to"
|
||||
log.warning("Overriding close timeout %s secs to 0 in order to"
|
||||
" prevent useless blocking due to self-join. This"
|
||||
" means you have incorrectly invoked close with a"
|
||||
" non-zero timeout from the producer call-back.",
|
||||
str(self), timeout)
|
||||
timeout)
|
||||
else:
|
||||
# Try to close gracefully.
|
||||
if self._sender is not None:
|
||||
@@ -643,13 +496,12 @@ class KafkaProducer(object):
|
||||
self._sender.join(timeout)
|
||||
|
||||
if self._sender is not None and self._sender.is_alive():
|
||||
log.info("%s: Proceeding to force close the producer since pending"
|
||||
log.info("Proceeding to force close the producer since pending"
|
||||
" requests could not be completed within timeout %s.",
|
||||
str(self), timeout)
|
||||
timeout)
|
||||
self._sender.force_close()
|
||||
|
||||
if self._metrics:
|
||||
self._metrics.close()
|
||||
self._metrics.close()
|
||||
try:
|
||||
self.config['key_serializer'].close()
|
||||
except AttributeError:
|
||||
@@ -659,23 +511,23 @@ class KafkaProducer(object):
|
||||
except AttributeError:
|
||||
pass
|
||||
self._closed = True
|
||||
log.debug("%s: The Kafka producer has closed.", str(self))
|
||||
log.debug("The Kafka producer has closed.")
|
||||
|
||||
def partitions_for(self, topic):
|
||||
"""Returns set of all known partitions for the topic."""
|
||||
return self._wait_on_metadata(topic, self.config['max_block_ms'])
|
||||
max_wait = self.config['max_block_ms'] / 1000.0
|
||||
return self._wait_on_metadata(topic, max_wait)
|
||||
|
||||
@classmethod
|
||||
def max_usable_produce_magic(cls, api_version):
|
||||
if api_version >= (0, 11):
|
||||
def _max_usable_produce_magic(self):
|
||||
if self.config['api_version'] >= (0, 11):
|
||||
return 2
|
||||
elif api_version >= (0, 10, 0):
|
||||
elif self.config['api_version'] >= (0, 10):
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
def _estimate_size_in_bytes(self, key, value, headers=[]):
|
||||
magic = self.max_usable_produce_magic(self.config['api_version'])
|
||||
magic = self._max_usable_produce_magic()
|
||||
if magic == 2:
|
||||
return DefaultRecordBatchBuilder.estimate_size_in_bytes(
|
||||
key, value, headers)
|
||||
@@ -683,114 +535,6 @@ class KafkaProducer(object):
|
||||
return LegacyRecordBatchBuilder.estimate_size_in_bytes(
|
||||
magic, self.config['compression_type'], key, value)
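The two estimate calls above are static helpers, so the pre-append size check can be sketched in isolation. A hedged illustration, not part of this commit; the key, value and header bytes are made up.

from kafka.record.default_records import DefaultRecordBatchBuilder
from kafka.record.legacy_records import LegacyRecordBatchBuilder

key, value, headers = b'k', b'v' * 100, [('trace-id', b'abc123')]

# Message format v2 (magic 2, brokers >= 0.11): headers count toward the estimate
v2_size = DefaultRecordBatchBuilder.estimate_size_in_bytes(key, value, headers)

# Legacy formats (magic 0/1) take the magic and compression type explicitly
v1_size = LegacyRecordBatchBuilder.estimate_size_in_bytes(1, None, key, value)

print(v2_size, v1_size)  # upper-bound byte estimates checked against max_request_size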
|
||||
|
||||
def init_transactions(self):
|
||||
"""
|
||||
Needs to be called before any other methods when the transactional.id is set in the configuration.
|
||||
|
||||
This method does the following:
|
||||
1. Ensures any transactions initiated by previous instances of the producer with the same
|
||||
transactional_id are completed. If the previous instance had failed with a transaction in
|
||||
progress, it will be aborted. If the last transaction had begun completion,
|
||||
but not yet finished, this method awaits its completion.
|
||||
2. Gets the internal producer id and epoch, used in all future transactional
|
||||
messages issued by the producer.
|
||||
|
||||
Note that this method will raise KafkaTimeoutError if the transactional state cannot
|
||||
be initialized before expiration of `max_block_ms`.
|
||||
|
||||
Retrying after a KafkaTimeoutError will continue to wait for the prior request to succeed or fail.
|
||||
Retrying after any other exception will start a new initialization attempt.
|
||||
Retrying after a successful initialization will do nothing.
|
||||
|
||||
Raises:
|
||||
IllegalStateError: if no transactional_id has been configured
|
||||
AuthorizationError: fatal error indicating that the configured
|
||||
transactional_id is not authorized.
|
||||
KafkaError: if the producer has encountered a previous fatal error or for any other unexpected error
|
||||
KafkaTimeoutError: if the time taken to initialize the transaction has surpassed `max.block.ms`.
|
||||
"""
|
||||
if not self._transaction_manager:
|
||||
raise Errors.IllegalStateError("Cannot call init_transactions without setting a transactional_id.")
|
||||
if self._init_transactions_result is None:
|
||||
self._init_transactions_result = self._transaction_manager.initialize_transactions()
|
||||
self._sender.wakeup()
|
||||
|
||||
try:
|
||||
if not self._init_transactions_result.wait(timeout_ms=self.config['max_block_ms']):
|
||||
raise Errors.KafkaTimeoutError("Timeout expired while initializing transactional state in %s ms." % (self.config['max_block_ms'],))
|
||||
finally:
|
||||
if self._init_transactions_result.failed:
|
||||
self._init_transactions_result = None
|
||||
|
||||
def begin_transaction(self):
|
||||
""" Should be called before the start of each new transaction.
|
||||
|
||||
Note that prior to the first invocation of this method,
|
||||
you must invoke `init_transactions()` exactly one time.
|
||||
|
||||
Raises:
|
||||
ProducerFencedError: if another producer with the same
transactional_id is active.
|
||||
"""
|
||||
# Set the transactional bit in the producer.
|
||||
if not self._transaction_manager:
|
||||
raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions")
|
||||
self._transaction_manager.begin_transaction()
|
||||
|
||||
def send_offsets_to_transaction(self, offsets, consumer_group_id):
|
||||
"""
|
||||
Sends a list of consumed offsets to the consumer group coordinator, and also marks
|
||||
those offsets as part of the current transaction. These offsets will be considered
|
||||
consumed only if the transaction is committed successfully.
|
||||
|
||||
This method should be used when you need to batch consumed and produced messages
|
||||
together, typically in a consume-transform-produce pattern.
|
||||
|
||||
Arguments:
|
||||
offsets ({TopicPartition: OffsetAndMetadata}): map of topic-partition -> offsets to commit
|
||||
as part of current transaction.
|
||||
consumer_group_id (str): Name of consumer group for offsets commit.
|
||||
|
||||
Raises:
|
||||
IllegalStateError: if no transactional_id, or transaction has not been started.
|
||||
ProducerFencedError: fatal error indicating another producer with the same transactional_id is active.
|
||||
UnsupportedVersionError: fatal error indicating the broker does not support transactions (i.e. if < 0.11).
|
||||
UnsupportedForMessageFormatError: fatal error indicating the message format used for the offsets
|
||||
topic on the broker does not support transactions.
|
||||
AuthorizationError: fatal error indicating that the configured transactional_id is not authorized.
|
||||
KafkaError: if the producer has encountered a previous fatal or abortable error, or for any
other unexpected error
|
||||
"""
|
||||
if not self._transaction_manager:
|
||||
raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions")
|
||||
result = self._transaction_manager.send_offsets_to_transaction(offsets, consumer_group_id)
|
||||
self._sender.wakeup()
|
||||
result.wait()
|
||||
|
||||
def commit_transaction(self):
|
||||
""" Commits the ongoing transaction.
|
||||
|
||||
Raises: ProducerFencedError if another producer with the same
|
||||
transactional_id is active.
|
||||
"""
|
||||
if not self._transaction_manager:
|
||||
raise Errors.IllegalStateError("Cannot commit transaction since transactions are not enabled")
|
||||
result = self._transaction_manager.begin_commit()
|
||||
self._sender.wakeup()
|
||||
result.wait()
|
||||
|
||||
def abort_transaction(self):
|
||||
""" Aborts the ongoing transaction.
|
||||
|
||||
Raises: ProducerFencedError if another producer with the same
|
||||
transactional_id is active.
|
||||
"""
|
||||
if not self._transaction_manager:
|
||||
raise Errors.IllegalStateError("Cannot abort transaction since transactions are not enabled.")
|
||||
result = self._transaction_manager.begin_abort()
|
||||
self._sender.wakeup()
|
||||
result.wait()
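Taken together, init_transactions, begin_transaction, send_offsets_to_transaction, commit_transaction and abort_transaction support the consume-transform-produce pattern mentioned above. A hedged sketch, not part of this commit: topics, group id and broker address are placeholders, the transform step is omitted, and the OffsetAndMetadata shape varies slightly across kafka-python versions.

from kafka import KafkaConsumer, KafkaProducer
from kafka.structs import OffsetAndMetadata

consumer = KafkaConsumer('input-topic', group_id='transform-group',
                         bootstrap_servers='localhost:9092',
                         enable_auto_commit=False)
producer = KafkaProducer(bootstrap_servers='localhost:9092',
                         transactional_id='transform-worker-0')

producer.init_transactions()  # once, before the first transaction
while True:
    records_by_partition = consumer.poll(timeout_ms=1000)
    if not records_by_partition:
        continue
    producer.begin_transaction()
    try:
        offsets = {}
        for tp, records in records_by_partition.items():
            for record in records:
                producer.send('output-topic', record.value)  # transform() omitted
            # Newer kafka-python versions add a third leader_epoch field here.
            offsets[tp] = OffsetAndMetadata(records[-1].offset + 1, '')
        producer.send_offsets_to_transaction(offsets, 'transform-group')
        producer.commit_transaction()
    except Exception:
        producer.abort_transaction()
        raise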
|
||||
|
||||
def send(self, topic, value=None, key=None, headers=None, partition=None, timestamp_ms=None):
|
||||
"""Publish a message to a topic.
|
||||
|
||||
@@ -823,58 +567,44 @@ class KafkaProducer(object):
|
||||
Raises:
|
||||
KafkaTimeoutError: if unable to fetch topic metadata, or unable
|
||||
to obtain memory buffer prior to configured max_block_ms
|
||||
TypeError: if topic is not a string
|
||||
ValueError: if topic is invalid: must be chars (a-zA-Z0-9._-), and less than 250 length
|
||||
AssertionError: if KafkaProducer is closed, or key and value are both None
|
||||
"""
|
||||
assert not self._closed, 'KafkaProducer already closed!'
|
||||
assert value is not None or self.config['api_version'] >= (0, 8, 1), (
|
||||
'Null messages require kafka >= 0.8.1')
|
||||
assert not (value is None and key is None), 'Need at least one: key or value'
|
||||
ensure_valid_topic_name(topic)
|
||||
key_bytes = value_bytes = None
|
||||
timer = Timer(self.config['max_block_ms'], "Failed to assign partition for message in max_block_ms.")
|
||||
try:
|
||||
assigned_partition = None
|
||||
while assigned_partition is None and not timer.expired:
|
||||
self._wait_on_metadata(topic, timer.timeout_ms)
|
||||
self._wait_on_metadata(topic, self.config['max_block_ms'] / 1000.0)
|
||||
|
||||
key_bytes = self._serialize(
|
||||
self.config['key_serializer'],
|
||||
topic, key)
|
||||
value_bytes = self._serialize(
|
||||
self.config['value_serializer'],
|
||||
topic, value)
|
||||
assert type(key_bytes) in (bytes, bytearray, memoryview, type(None))
|
||||
assert type(value_bytes) in (bytes, bytearray, memoryview, type(None))
|
||||
key_bytes = self._serialize(
|
||||
self.config['key_serializer'],
|
||||
topic, key)
|
||||
value_bytes = self._serialize(
|
||||
self.config['value_serializer'],
|
||||
topic, value)
|
||||
assert type(key_bytes) in (bytes, bytearray, memoryview, type(None))
|
||||
assert type(value_bytes) in (bytes, bytearray, memoryview, type(None))
|
||||
|
||||
assigned_partition = self._partition(topic, partition, key, value,
|
||||
key_bytes, value_bytes)
|
||||
if assigned_partition is None:
|
||||
raise Errors.KafkaTimeoutError("Failed to assign partition for message after %s secs." % timer.elapsed_ms / 1000)
|
||||
else:
|
||||
partition = assigned_partition
|
||||
partition = self._partition(topic, partition, key, value,
|
||||
key_bytes, value_bytes)
|
||||
|
||||
if headers is None:
|
||||
headers = []
|
||||
assert isinstance(headers, list)
|
||||
assert all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str) and isinstance(item[1], bytes) for item in headers)
|
||||
assert type(headers) == list
|
||||
assert all(type(item) == tuple and len(item) == 2 and type(item[0]) == str and type(item[1]) == bytes for item in headers)
|
||||
|
||||
message_size = self._estimate_size_in_bytes(key_bytes, value_bytes, headers)
|
||||
self._ensure_valid_record_size(message_size)
|
||||
|
||||
tp = TopicPartition(topic, partition)
|
||||
log.debug("%s: Sending (key=%r value=%r headers=%r) to %s", str(self), key, value, headers, tp)
|
||||
|
||||
if self._transaction_manager and self._transaction_manager.is_transactional():
|
||||
self._transaction_manager.maybe_add_partition_to_transaction(tp)
|
||||
|
||||
log.debug("Sending (key=%r value=%r headers=%r) to %s", key, value, headers, tp)
|
||||
result = self._accumulator.append(tp, timestamp_ms,
|
||||
key_bytes, value_bytes, headers)
|
||||
key_bytes, value_bytes, headers,
|
||||
self.config['max_block_ms'],
|
||||
estimated_size=message_size)
|
||||
future, batch_is_full, new_batch_created = result
|
||||
if batch_is_full or new_batch_created:
|
||||
log.debug("%s: Waking up the sender since %s is either full or"
|
||||
" getting a new batch", str(self), tp)
|
||||
log.debug("Waking up the sender since %s is either full or"
|
||||
" getting a new batch", tp)
|
||||
self._sender.wakeup()
|
||||
|
||||
return future
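The append path above expects headers as a list of (str, bytes) tuples and runs key/value through the configured serializers before estimating the record size. A usage sketch, not part of this commit: topic, broker address and serializers are placeholders, and record headers require message format v2 (brokers >= 0.11).

import json

from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    key_serializer=str.encode,                           # str -> bytes
    value_serializer=lambda v: json.dumps(v).encode(),   # dict -> bytes
)

future = producer.send(
    'events',
    key='user-42',
    value={'action': 'login'},
    headers=[('source', b'web')],   # list of (str, bytes) tuples, as asserted above
)
record_metadata = future.get(timeout=10)
producer.flush()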
|
||||
@@ -882,7 +612,7 @@ class KafkaProducer(object):
|
||||
# for API exceptions return them in the future,
|
||||
# for other exceptions raise directly
|
||||
except Errors.BrokerResponseError as e:
|
||||
log.error("%s: Exception occurred during message send: %s", str(self), e)
|
||||
log.debug("Exception occurred during message send: %s", e)
|
||||
return FutureRecordMetadata(
|
||||
FutureProduceResult(TopicPartition(topic, partition)),
|
||||
-1, None, None,
|
||||
@@ -913,7 +643,7 @@ class KafkaProducer(object):
|
||||
KafkaTimeoutError: failure to flush buffered records within the
|
||||
provided timeout
|
||||
"""
|
||||
log.debug("%s: Flushing accumulated records in producer.", str(self))
|
||||
log.debug("Flushing accumulated records in producer.") # trace
|
||||
self._accumulator.begin_flush()
|
||||
self._sender.wakeup()
|
||||
self._accumulator.await_flush_completion(timeout=timeout)
|
||||
@@ -925,8 +655,13 @@ class KafkaProducer(object):
|
||||
"The message is %d bytes when serialized which is larger than"
|
||||
" the maximum request size you have configured with the"
|
||||
" max_request_size configuration" % (size,))
|
||||
if size > self.config['buffer_memory']:
|
||||
raise Errors.MessageSizeTooLargeError(
|
||||
"The message is %d bytes when serialized which is larger than"
|
||||
" the total memory buffer you have configured with the"
|
||||
" buffer_memory configuration." % (size,))
|
||||
|
||||
def _wait_on_metadata(self, topic, max_wait_ms):
|
||||
def _wait_on_metadata(self, topic, max_wait):
|
||||
"""
|
||||
Wait for cluster metadata including partitions for the given topic to
|
||||
be available.
|
||||
@@ -944,31 +679,32 @@ class KafkaProducer(object):
|
||||
"""
|
||||
# add topic to metadata topic list if it is not there already.
|
||||
self._sender.add_topic(topic)
|
||||
timer = Timer(max_wait_ms, "Failed to update metadata after %.1f secs." % (max_wait_ms / 1000,))
|
||||
begin = time.time()
|
||||
elapsed = 0.0
|
||||
metadata_event = None
|
||||
while True:
|
||||
partitions = self._metadata.partitions_for_topic(topic)
|
||||
if partitions is not None:
|
||||
return partitions
|
||||
timer.maybe_raise()
|
||||
|
||||
if not metadata_event:
|
||||
metadata_event = threading.Event()
|
||||
|
||||
log.debug("%s: Requesting metadata update for topic %s", str(self), topic)
|
||||
log.debug("Requesting metadata update for topic %s", topic)
|
||||
|
||||
metadata_event.clear()
|
||||
future = self._metadata.request_update()
|
||||
future.add_both(lambda e, *args: e.set(), metadata_event)
|
||||
self._sender.wakeup()
|
||||
metadata_event.wait(timer.timeout_ms / 1000)
|
||||
if not future.is_done:
|
||||
metadata_event.wait(max_wait - elapsed)
|
||||
elapsed = time.time() - begin
|
||||
if not metadata_event.is_set():
|
||||
raise Errors.KafkaTimeoutError(
|
||||
"Failed to update metadata after %.1f secs." % (max_wait_ms / 1000,))
|
||||
elif future.failed() and not future.retriable():
|
||||
raise future.exception
|
||||
"Failed to update metadata after %.1f secs." % (max_wait,))
|
||||
elif topic in self._metadata.unauthorized_topics:
|
||||
raise Errors.TopicAuthorizationFailedError(set([topic]))
|
||||
raise Errors.TopicAuthorizationFailedError(topic)
|
||||
else:
|
||||
log.debug("%s: _wait_on_metadata woke after %s secs.", str(self), timer.elapsed_ms / 1000)
|
||||
log.debug("_wait_on_metadata woke after %s secs.", elapsed)
|
||||
|
||||
def _serialize(self, f, topic, data):
|
||||
if not f:
|
||||
@@ -979,18 +715,16 @@ class KafkaProducer(object):
|
||||
|
||||
def _partition(self, topic, partition, key, value,
|
||||
serialized_key, serialized_value):
|
||||
all_partitions = self._metadata.partitions_for_topic(topic)
|
||||
available = self._metadata.available_partitions_for_topic(topic)
|
||||
if all_partitions is None or available is None:
|
||||
return None
|
||||
if partition is not None:
|
||||
assert partition >= 0
|
||||
assert partition in all_partitions, 'Unrecognized partition'
|
||||
assert partition in self._metadata.partitions_for_topic(topic), 'Unrecognized partition'
|
||||
return partition
|
||||
|
||||
all_partitions = sorted(self._metadata.partitions_for_topic(topic))
|
||||
available = list(self._metadata.available_partitions_for_topic(topic))
|
||||
return self.config['partitioner'](serialized_key,
|
||||
sorted(all_partitions),
|
||||
list(available))
|
||||
all_partitions,
|
||||
available)
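When no explicit partition is given, _partition above defers to the configured partitioner with the serialized key, the full partition list and the currently available partitions. A small sketch of that call, not part of this commit; the partition ids are made up.

from kafka.partitioner.default import DefaultPartitioner

partitioner = DefaultPartitioner()
all_partitions = [0, 1, 2, 3]
available = [0, 1, 3]

# Non-None key: murmur2 hash of the key modulo the total partition count
print(partitioner(b'user-42', all_partitions, available))
# None key: a random pick from the currently available partitions
print(partitioner(None, all_partitions, available))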
|
||||
|
||||
def metrics(self, raw=False):
|
||||
"""Get metrics on producer performance.
|
||||
@@ -1002,8 +736,6 @@ class KafkaProducer(object):
|
||||
This is an unstable interface. It may change in future
|
||||
releases without warning.
|
||||
"""
|
||||
if not self._metrics:
|
||||
return
|
||||
if raw:
|
||||
return self._metrics.metrics.copy()
|
||||
|
||||
@@ -1015,6 +747,3 @@ class KafkaProducer(object):
|
||||
metrics[k.group][k.name] = {}
|
||||
metrics[k.group][k.name] = v.value()
|
||||
return metrics
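metrics() above flattens the metric registry into nested plain dicts keyed by group and name. A short sketch of consuming it, not part of this commit; the broker address is a placeholder and the dict is empty when metrics are disabled.

from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')
for group, values in producer.metrics().items():
    for name, value in values.items():
        print(group, name, value)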
|
||||
|
||||
def __str__(self):
|
||||
return "<KafkaProducer client_id=%s transactional_id=%s>" % (self.config['client_id'], self.config['transactional_id'])
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from __future__ import absolute_import, division
|
||||
from __future__ import absolute_import
|
||||
|
||||
import collections
|
||||
import copy
|
||||
@@ -6,14 +6,8 @@ import logging
|
||||
import threading
|
||||
import time
|
||||
|
||||
try:
|
||||
# enum in stdlib as of py3.4
|
||||
from enum import IntEnum # pylint: disable=import-error
|
||||
except ImportError:
|
||||
# vendored backport module
|
||||
from kafka.vendor.enum34 import IntEnum
|
||||
|
||||
import kafka.errors as Errors
|
||||
from kafka.producer.buffer import SimpleBufferPool
|
||||
from kafka.producer.future import FutureRecordMetadata, FutureProduceResult
|
||||
from kafka.record.memory_records import MemoryRecordsBuilder
|
||||
from kafka.structs import TopicPartition
|
||||
@@ -41,16 +35,10 @@ class AtomicInteger(object):
|
||||
return self._val
|
||||
|
||||
|
||||
class FinalState(IntEnum):
|
||||
ABORTED = 0
|
||||
FAILED = 1
|
||||
SUCCEEDED = 2
|
||||
|
||||
|
||||
class ProducerBatch(object):
|
||||
def __init__(self, tp, records, now=None):
|
||||
now = time.time() if now is None else now
|
||||
def __init__(self, tp, records, buffer):
|
||||
self.max_record_size = 0
|
||||
now = time.time()
|
||||
self.created = now
|
||||
self.drained = None
|
||||
self.attempts = 0
|
||||
@@ -60,120 +48,81 @@ class ProducerBatch(object):
|
||||
self.topic_partition = tp
|
||||
self.produce_future = FutureProduceResult(tp)
|
||||
self._retry = False
|
||||
self._final_state = None
|
||||
|
||||
@property
|
||||
def final_state(self):
|
||||
return self._final_state
|
||||
self._buffer = buffer # We only save it, we don't write to it
|
||||
|
||||
@property
|
||||
def record_count(self):
|
||||
return self.records.next_offset()
|
||||
|
||||
@property
|
||||
def producer_id(self):
|
||||
return self.records.producer_id if self.records else None
|
||||
|
||||
@property
|
||||
def producer_epoch(self):
|
||||
return self.records.producer_epoch if self.records else None
|
||||
|
||||
@property
|
||||
def has_sequence(self):
|
||||
return self.records.has_sequence if self.records else False
|
||||
|
||||
def try_append(self, timestamp_ms, key, value, headers, now=None):
|
||||
def try_append(self, timestamp_ms, key, value, headers):
|
||||
metadata = self.records.append(timestamp_ms, key, value, headers)
|
||||
if metadata is None:
|
||||
return None
|
||||
|
||||
now = time.time() if now is None else now
|
||||
self.max_record_size = max(self.max_record_size, metadata.size)
|
||||
self.last_append = now
|
||||
future = FutureRecordMetadata(
|
||||
self.produce_future,
|
||||
metadata.offset,
|
||||
metadata.timestamp,
|
||||
metadata.crc,
|
||||
len(key) if key is not None else -1,
|
||||
len(value) if value is not None else -1,
|
||||
sum(len(h_key.encode("utf-8")) + len(h_val) for h_key, h_val in headers) if headers else -1)
|
||||
self.last_append = time.time()
|
||||
future = FutureRecordMetadata(self.produce_future, metadata.offset,
|
||||
metadata.timestamp, metadata.crc,
|
||||
len(key) if key is not None else -1,
|
||||
len(value) if value is not None else -1,
|
||||
sum(len(h_key.encode("utf-8")) + len(h_val) for h_key, h_val in headers) if headers else -1)
|
||||
return future
|
||||
|
||||
def abort(self, exception):
|
||||
"""Abort the batch and complete the future and callbacks."""
|
||||
if self._final_state is not None:
|
||||
raise Errors.IllegalStateError("Batch has already been completed in final state: %s" % self._final_state)
|
||||
self._final_state = FinalState.ABORTED
|
||||
|
||||
log.debug("Aborting batch for partition %s: %s", self.topic_partition, exception)
|
||||
self._complete_future(-1, -1, exception)
|
||||
|
||||
def done(self, base_offset=None, timestamp_ms=None, exception=None):
|
||||
"""
|
||||
Finalize the state of a batch. Final state, once set, is immutable. This function may be called
|
||||
once or twice on a batch. It may be called twice if
|
||||
1. An inflight batch expires before a response from the broker is received. The batch's final
|
||||
state is set to FAILED. But it could succeed on the broker and second time around batch.done() may
|
||||
try to set SUCCEEDED final state.
|
||||
|
||||
2. If a transaction abortion happens or if the producer is closed forcefully, the final state is
|
||||
ABORTED but again it could succeed if broker responds with a success.
|
||||
|
||||
Attempted transitions from [FAILED | ABORTED] --> SUCCEEDED are logged.
|
||||
Attempted transitions from one failure state to the same or a different failed state are ignored.
|
||||
Attempted transitions from SUCCEEDED to the same or a failed state throw an exception.
|
||||
"""
|
||||
final_state = FinalState.SUCCEEDED if exception is None else FinalState.FAILED
|
||||
if self._final_state is None:
|
||||
self._final_state = final_state
|
||||
if final_state is FinalState.SUCCEEDED:
|
||||
log.debug("Successfully produced messages to %s with base offset %s", self.topic_partition, base_offset)
|
||||
else:
|
||||
log.warning("Failed to produce messages to topic-partition %s with base offset %s: %s",
|
||||
self.topic_partition, base_offset, exception)
|
||||
self._complete_future(base_offset, timestamp_ms, exception)
|
||||
return True
|
||||
|
||||
elif self._final_state is not FinalState.SUCCEEDED:
|
||||
if final_state is FinalState.SUCCEEDED:
|
||||
# Log if a previously unsuccessful batch succeeded later on.
|
||||
log.debug("ProduceResponse returned %s for %s after batch with base offset %s had already been %s.",
|
||||
final_state, self.topic_partition, base_offset, self._final_state)
|
||||
else:
|
||||
# FAILED --> FAILED and ABORTED --> FAILED transitions are ignored.
|
||||
log.debug("Ignored state transition %s -> %s for %s batch with base offset %s",
|
||||
self._final_state, final_state, self.topic_partition, base_offset)
|
||||
else:
|
||||
# A SUCCESSFUL batch must not attempt another state change.
|
||||
raise Errors.IllegalStateError("A %s batch must not attempt another state change to %s" % (self._final_state, final_state))
|
||||
return False
|
||||
|
||||
def _complete_future(self, base_offset, timestamp_ms, exception):
|
||||
def done(self, base_offset=None, timestamp_ms=None, exception=None, log_start_offset=None, global_error=None):
|
||||
level = logging.DEBUG if exception is None else logging.WARNING
|
||||
log.log(level, "Produced messages to topic-partition %s with base offset"
|
||||
" %s log start offset %s and error %s.", self.topic_partition, base_offset,
|
||||
log_start_offset, global_error) # trace
|
||||
if self.produce_future.is_done:
|
||||
raise Errors.IllegalStateError('Batch is already closed!')
|
||||
log.warning('Batch is already closed -- ignoring batch.done()')
|
||||
return
|
||||
elif exception is None:
|
||||
self.produce_future.success((base_offset, timestamp_ms))
|
||||
self.produce_future.success((base_offset, timestamp_ms, log_start_offset))
|
||||
else:
|
||||
self.produce_future.failure(exception)
|
||||
|
||||
def has_reached_delivery_timeout(self, delivery_timeout_ms, now=None):
|
||||
now = time.time() if now is None else now
|
||||
return delivery_timeout_ms / 1000 <= now - self.created
|
||||
def maybe_expire(self, request_timeout_ms, retry_backoff_ms, linger_ms, is_full):
|
||||
"""Expire batches if metadata is not available
|
||||
|
||||
A batch whose metadata is not available should be expired if one
|
||||
of the following is true:
|
||||
|
||||
* the batch is not in retry AND request timeout has elapsed after
|
||||
it is ready (full or linger.ms has reached).
|
||||
|
||||
* the batch is in retry AND request timeout has elapsed after the
|
||||
backoff period ended.
|
||||
"""
|
||||
now = time.time()
|
||||
since_append = now - self.last_append
|
||||
since_ready = now - (self.created + linger_ms / 1000.0)
|
||||
since_backoff = now - (self.last_attempt + retry_backoff_ms / 1000.0)
|
||||
timeout = request_timeout_ms / 1000.0
|
||||
|
||||
error = None
|
||||
if not self.in_retry() and is_full and timeout < since_append:
|
||||
error = "%d seconds have passed since last append" % (since_append,)
|
||||
elif not self.in_retry() and timeout < since_ready:
|
||||
error = "%d seconds have passed since batch creation plus linger time" % (since_ready,)
|
||||
elif self.in_retry() and timeout < since_backoff:
|
||||
error = "%d seconds have passed since last attempt plus backoff time" % (since_backoff,)
|
||||
|
||||
if error:
|
||||
self.records.close()
|
||||
self.done(-1, None, Errors.KafkaTimeoutError(
|
||||
"Batch for %s containing %s record(s) expired: %s" % (
|
||||
self.topic_partition, self.records.next_offset(), error)))
|
||||
return True
|
||||
return False
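# Hedged sketch of the three expiry rules evaluated above, written as a pure function so the
# conditions can be tested in isolation (argument names are illustrative, not the library API):
def expiry_reason(now, created, last_append, last_attempt,
                  request_timeout_ms, retry_backoff_ms, linger_ms,
                  is_full, in_retry):
    timeout = request_timeout_ms / 1000.0
    if not in_retry and is_full and timeout < now - last_append:
        return "request timeout elapsed since last append on a full batch"
    if not in_retry and timeout < now - (created + linger_ms / 1000.0):
        return "request timeout elapsed since creation plus linger time"
    if in_retry and timeout < now - (last_attempt + retry_backoff_ms / 1000.0):
        return "request timeout elapsed since last attempt plus backoff"
    return None

# e.g. a full, non-retried batch last appended 60s ago with a 30s request timeout is expired
assert expiry_reason(now=100.0, created=30.0, last_append=40.0, last_attempt=40.0,
                     request_timeout_ms=30000, retry_backoff_ms=100, linger_ms=0,
                     is_full=True, in_retry=False) is not None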
|
||||
def in_retry(self):
|
||||
return self._retry
|
||||
|
||||
def retry(self, now=None):
|
||||
now = time.time() if now is None else now
|
||||
def set_retry(self):
|
||||
self._retry = True
|
||||
self.attempts += 1
|
||||
self.last_attempt = now
|
||||
self.last_append = now
|
||||
|
||||
@property
|
||||
def is_done(self):
|
||||
return self.produce_future.is_done
|
||||
def buffer(self):
|
||||
return self._buffer
|
||||
|
||||
def __str__(self):
|
||||
return 'ProducerBatch(topic_partition=%s, record_count=%d)' % (
|
||||
@@ -194,6 +143,12 @@ class RecordAccumulator(object):
|
||||
A small batch size will make batching less common and may reduce
|
||||
throughput (a batch size of zero will disable batching entirely).
|
||||
Default: 16384
|
||||
buffer_memory (int): The total bytes of memory the producer should use
|
||||
to buffer records waiting to be sent to the server. If records are
|
||||
sent faster than they can be delivered to the server the producer
|
||||
will block up to max_block_ms, raising an exception on timeout.
|
||||
In the current implementation, this setting is an approximation.
|
||||
Default: 33554432 (32MB)
|
||||
compression_attrs (int): The compression type for all data generated by
|
||||
the producer. Valid values are gzip(1), snappy(2), lz4(3), or
|
||||
none(0).
|
||||
@@ -201,7 +156,7 @@ class RecordAccumulator(object):
|
||||
will also impact the compression ratio (more batching means better
|
||||
compression). Default: None.
|
||||
linger_ms (int): An artificial delay time to add before declaring a
|
||||
record batch (that isn't full) ready for sending. This allows
|
||||
messageset (that isn't full) ready for sending. This allows
|
||||
time for more records to arrive. Setting a non-zero linger_ms
|
||||
will trade off some latency for potentially better throughput
|
||||
due to more batching (and hence fewer, larger requests).
|
||||
@@ -211,14 +166,14 @@ class RecordAccumulator(object):
|
||||
all retries in a short period of time. Default: 100
|
||||
"""
|
||||
DEFAULT_CONFIG = {
|
||||
'buffer_memory': 33554432,
|
||||
'batch_size': 16384,
|
||||
'compression_attrs': 0,
|
||||
'linger_ms': 0,
|
||||
'request_timeout_ms': 30000,
|
||||
'delivery_timeout_ms': 120000,
|
||||
'retry_backoff_ms': 100,
|
||||
'transaction_manager': None,
|
||||
'message_version': 2,
|
||||
'message_version': 0,
|
||||
'metrics': None,
|
||||
'metric_group_prefix': 'producer-metrics',
|
||||
}
|
||||
|
||||
def __init__(self, **configs):
|
||||
@@ -228,37 +183,22 @@ class RecordAccumulator(object):
|
||||
self.config[key] = configs.pop(key)
|
||||
|
||||
self._closed = False
|
||||
self._transaction_manager = self.config['transaction_manager']
|
||||
self._flushes_in_progress = AtomicInteger()
|
||||
self._appends_in_progress = AtomicInteger()
|
||||
self._batches = collections.defaultdict(collections.deque) # TopicPartition: [ProducerBatch]
|
||||
self._tp_locks = {None: threading.Lock()} # TopicPartition: Lock, plus a lock to add entries
|
||||
self._free = SimpleBufferPool(self.config['buffer_memory'],
|
||||
self.config['batch_size'],
|
||||
metrics=self.config['metrics'],
|
||||
metric_group_prefix=self.config['metric_group_prefix'])
|
||||
self._incomplete = IncompleteProducerBatches()
|
||||
# The following variables should only be accessed by the sender thread,
|
||||
# so we don't need to protect them w/ locking.
|
||||
self.muted = set()
|
||||
self._drain_index = 0
|
||||
self._next_batch_expiry_time_ms = float('inf')
|
||||
|
||||
if self.config['delivery_timeout_ms'] < self.config['linger_ms'] + self.config['request_timeout_ms']:
|
||||
raise Errors.KafkaConfigurationError("Must set delivery_timeout_ms higher than linger_ms + request_timeout_ms")
|
||||
|
||||
@property
|
||||
def delivery_timeout_ms(self):
|
||||
return self.config['delivery_timeout_ms']
|
||||
|
||||
@property
|
||||
def next_expiry_time_ms(self):
|
||||
return self._next_batch_expiry_time_ms
|
||||
|
||||
def _tp_lock(self, tp):
|
||||
if tp not in self._tp_locks:
|
||||
with self._tp_locks[None]:
|
||||
if tp not in self._tp_locks:
|
||||
self._tp_locks[tp] = threading.Lock()
|
||||
return self._tp_locks[tp]
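# The helper above lazily creates one lock per partition, using the special None key as a
# guard lock so two threads cannot both insert a lock for the same partition. A standalone
# sketch of the same double-checked pattern (the dictionary and function names are illustrative):
import threading

_locks = {None: threading.Lock()}

def lock_for(key):
    if key not in _locks:
        with _locks[None]:          # guard lock serializes lock creation
            if key not in _locks:   # re-check after acquiring the guard
                _locks[key] = threading.Lock()
    return _locks[key]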
|
||||
def append(self, tp, timestamp_ms, key, value, headers, now=None):
|
||||
def append(self, tp, timestamp_ms, key, value, headers, max_time_to_block_ms,
|
||||
estimated_size=0):
|
||||
"""Add a record to the accumulator, return the append result.
|
||||
|
||||
The append result will contain the future metadata, and flag for
|
||||
@@ -271,53 +211,59 @@ class RecordAccumulator(object):
|
||||
key (bytes): The key for the record
|
||||
value (bytes): The value for the record
|
||||
headers (List[Tuple[str, bytes]]): The header fields for the record
|
||||
max_time_to_block_ms (int): The maximum time in milliseconds to
|
||||
block for buffer memory to be available
|
||||
|
||||
Returns:
|
||||
tuple: (future, batch_is_full, new_batch_created)
|
||||
"""
|
||||
assert isinstance(tp, TopicPartition), 'not TopicPartition'
|
||||
assert not self._closed, 'RecordAccumulator is closed'
|
||||
now = time.time() if now is None else now
|
||||
# We keep track of the number of appending threads to make sure we do
# not miss batches in abort_incomplete_batches().
|
||||
self._appends_in_progress.increment()
|
||||
try:
|
||||
with self._tp_lock(tp):
|
||||
if tp not in self._tp_locks:
|
||||
with self._tp_locks[None]:
|
||||
if tp not in self._tp_locks:
|
||||
self._tp_locks[tp] = threading.Lock()
|
||||
|
||||
with self._tp_locks[tp]:
|
||||
# check if we have an in-progress batch
|
||||
dq = self._batches[tp]
|
||||
if dq:
|
||||
last = dq[-1]
|
||||
future = last.try_append(timestamp_ms, key, value, headers, now=now)
|
||||
future = last.try_append(timestamp_ms, key, value, headers)
|
||||
if future is not None:
|
||||
batch_is_full = len(dq) > 1 or last.records.is_full()
|
||||
return future, batch_is_full, False
|
||||
|
||||
with self._tp_lock(tp):
|
||||
size = max(self.config['batch_size'], estimated_size)
|
||||
log.debug("Allocating a new %d byte message buffer for %s", size, tp) # trace
|
||||
buf = self._free.allocate(size, max_time_to_block_ms)
|
||||
with self._tp_locks[tp]:
|
||||
# Need to check if producer is closed again after grabbing the
|
||||
# dequeue lock.
|
||||
assert not self._closed, 'RecordAccumulator is closed'
|
||||
|
||||
if dq:
|
||||
last = dq[-1]
|
||||
future = last.try_append(timestamp_ms, key, value, headers, now=now)
|
||||
future = last.try_append(timestamp_ms, key, value, headers)
|
||||
if future is not None:
|
||||
# Somebody else found us a batch, return the one we
|
||||
# waited for! Hopefully this doesn't happen often...
|
||||
self._free.deallocate(buf)
|
||||
batch_is_full = len(dq) > 1 or last.records.is_full()
|
||||
return future, batch_is_full, False
|
||||
|
||||
if self._transaction_manager and self.config['message_version'] < 2:
|
||||
raise Errors.UnsupportedVersionError("Attempting to use idempotence with a broker which"
|
||||
" does not support the required message format (v2)."
|
||||
" The broker must be version 0.11 or later.")
|
||||
records = MemoryRecordsBuilder(
|
||||
self.config['message_version'],
|
||||
self.config['compression_attrs'],
|
||||
self.config['batch_size']
|
||||
)
|
||||
|
||||
batch = ProducerBatch(tp, records, now=now)
|
||||
future = batch.try_append(timestamp_ms, key, value, headers, now=now)
|
||||
batch = ProducerBatch(tp, records, buf)
|
||||
future = batch.try_append(timestamp_ms, key, value, headers)
|
||||
if not future:
|
||||
raise Exception()
|
||||
|
||||
@@ -328,43 +274,79 @@ class RecordAccumulator(object):
|
||||
finally:
|
||||
self._appends_in_progress.decrement()
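# A hedged usage sketch for the (future, batch_is_full, new_batch_created) tuple returned by
# append(), following the signature variant that takes max_time_to_block_ms. The `accumulator`
# and `sender` arguments are placeholders for whatever owns these objects in real code, not
# names defined in this module:
def send_one(accumulator, sender, tp, timestamp_ms, key, value, max_block_ms=60000):
    future, batch_is_full, new_batch_created = accumulator.append(
        tp, timestamp_ms, key, value, headers=[], max_time_to_block_ms=max_block_ms)
    if batch_is_full or new_batch_created:
        # a full (or brand-new) batch means there is work ready for the I/O thread
        sender.wakeup()
    return future  # resolves to produce metadata on success, or to an exception on failure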
|
||||
def reset_next_batch_expiry_time(self):
|
||||
self._next_batch_expiry_time_ms = float('inf')
|
||||
def abort_expired_batches(self, request_timeout_ms, cluster):
|
||||
"""Abort the batches that have been sitting in RecordAccumulator for
|
||||
more than the configured request_timeout due to metadata being
|
||||
unavailable.
|
||||
|
||||
def maybe_update_next_batch_expiry_time(self, batch):
|
||||
self._next_batch_expiry_time_ms = min(self._next_batch_expiry_time_ms, batch.created * 1000 + self.delivery_timeout_ms)
|
||||
Arguments:
|
||||
request_timeout_ms (int): milliseconds to timeout
|
||||
cluster (ClusterMetadata): current metadata for kafka cluster
|
||||
|
||||
def expired_batches(self, now=None):
|
||||
"""Get a list of batches which have been sitting in the accumulator too long and need to be expired."""
|
||||
Returns:
|
||||
list of ProducerBatch that were expired
|
||||
"""
|
||||
expired_batches = []
|
||||
to_remove = []
|
||||
count = 0
|
||||
for tp in list(self._batches.keys()):
|
||||
with self._tp_lock(tp):
|
||||
assert tp in self._tp_locks, 'TopicPartition not in locks dict'
|
||||
|
||||
# We only check whether the batch should be expired if the partition
# does not have a batch in flight. This avoids expiring later
# batches while an earlier batch is still in progress. This
# protection only takes effect when the user sets
# max.in.flight.requests.per.connection=1. Otherwise the expiration
# order is not guaranteed.
|
||||
if tp in self.muted:
|
||||
continue
|
||||
|
||||
with self._tp_locks[tp]:
|
||||
# iterate over the batches and expire them if they have stayed
|
||||
# in accumulator for more than request_timeout_ms
|
||||
dq = self._batches[tp]
|
||||
while dq:
|
||||
batch = dq[0]
|
||||
if batch.has_reached_delivery_timeout(self.delivery_timeout_ms, now=now):
|
||||
dq.popleft()
|
||||
batch.records.close()
|
||||
for batch in dq:
|
||||
is_full = bool(bool(batch != dq[-1]) or batch.records.is_full())
|
||||
# check if the batch is expired
|
||||
if batch.maybe_expire(request_timeout_ms,
|
||||
self.config['retry_backoff_ms'],
|
||||
self.config['linger_ms'],
|
||||
is_full):
|
||||
expired_batches.append(batch)
|
||||
to_remove.append(batch)
|
||||
count += 1
|
||||
self.deallocate(batch)
|
||||
else:
|
||||
# Stop at the first batch that has not expired.
|
||||
self.maybe_update_next_batch_expiry_time(batch)
|
||||
break
|
||||
|
||||
# Python does not allow us to mutate the dq during iteration
|
||||
# Assuming expired batches are infrequent, this is better than
|
||||
# creating a new copy of the deque for iteration on every loop
|
||||
if to_remove:
|
||||
for batch in to_remove:
|
||||
dq.remove(batch)
|
||||
to_remove = []
|
||||
|
||||
if expired_batches:
|
||||
log.warning("Expired %d batches in accumulator", count) # trace
|
||||
|
||||
return expired_batches
|
||||
|
||||
def reenqueue(self, batch, now=None):
|
||||
"""
|
||||
Re-enqueue the given record batch in the accumulator. Sender._complete_batch already checks
whether the batch has reached delivery_timeout_ms, so we do not repeat the delivery timeout check here.
|
||||
"""
|
||||
batch.retry(now=now)
|
||||
with self._tp_lock(batch.topic_partition):
|
||||
dq = self._batches[batch.topic_partition]
|
||||
def reenqueue(self, batch):
|
||||
"""Re-enqueue the given record batch in the accumulator to retry."""
|
||||
now = time.time()
|
||||
batch.attempts += 1
|
||||
batch.last_attempt = now
|
||||
batch.last_append = now
|
||||
batch.set_retry()
|
||||
assert batch.topic_partition in self._tp_locks, 'TopicPartition not in locks dict'
|
||||
assert batch.topic_partition in self._batches, 'TopicPartition not in batches'
|
||||
dq = self._batches[batch.topic_partition]
|
||||
with self._tp_locks[batch.topic_partition]:
|
||||
dq.appendleft(batch)
|
||||
|
||||
def ready(self, cluster, now=None):
|
||||
def ready(self, cluster):
|
||||
"""
|
||||
Get a list of nodes whose partitions are ready to be sent, and the
|
||||
earliest time at which any non-sendable partition will be ready;
|
||||
@@ -398,8 +380,9 @@ class RecordAccumulator(object):
|
||||
ready_nodes = set()
|
||||
next_ready_check = 9999999.99
|
||||
unknown_leaders_exist = False
|
||||
now = time.time() if now is None else now
|
||||
now = time.time()
|
||||
|
||||
exhausted = bool(self._free.queued() > 0)
|
||||
# several threads are accessing self._batches -- to simplify
|
||||
# concurrent access, we iterate over a snapshot of partitions
|
||||
# and lock each partition separately as needed
|
||||
@@ -414,23 +397,23 @@ class RecordAccumulator(object):
|
||||
elif tp in self.muted:
|
||||
continue
|
||||
|
||||
with self._tp_lock(tp):
|
||||
with self._tp_locks[tp]:
|
||||
dq = self._batches[tp]
|
||||
if not dq:
|
||||
continue
|
||||
batch = dq[0]
|
||||
retry_backoff = self.config['retry_backoff_ms'] / 1000
|
||||
linger = self.config['linger_ms'] / 1000
|
||||
backing_off = bool(batch.attempts > 0
|
||||
and (batch.last_attempt + retry_backoff) > now)
|
||||
retry_backoff = self.config['retry_backoff_ms'] / 1000.0
|
||||
linger = self.config['linger_ms'] / 1000.0
|
||||
backing_off = bool(batch.attempts > 0 and
|
||||
batch.last_attempt + retry_backoff > now)
|
||||
waited_time = now - batch.last_attempt
|
||||
time_to_wait = retry_backoff if backing_off else linger
|
||||
time_left = max(time_to_wait - waited_time, 0)
|
||||
full = bool(len(dq) > 1 or batch.records.is_full())
|
||||
expired = bool(waited_time >= time_to_wait)
|
||||
|
||||
sendable = (full or expired or self._closed or
|
||||
self.flush_in_progress())
|
||||
sendable = (full or expired or exhausted or self._closed or
|
||||
self._flush_in_progress())
|
||||
|
||||
if sendable and not backing_off:
|
||||
ready_nodes.add(leader)
|
||||
@@ -444,98 +427,16 @@ class RecordAccumulator(object):
|
||||
|
||||
return ready_nodes, next_ready_check, unknown_leaders_exist
|
||||
|
||||
def has_undrained(self):
|
||||
"""Check whether there are any batches which haven't been drained"""
|
||||
def has_unsent(self):
|
||||
"""Return whether there is any unsent record in the accumulator."""
|
||||
for tp in list(self._batches.keys()):
|
||||
with self._tp_lock(tp):
|
||||
with self._tp_locks[tp]:
|
||||
dq = self._batches[tp]
|
||||
if len(dq):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _should_stop_drain_batches_for_partition(self, first, tp):
|
||||
if self._transaction_manager:
|
||||
if not self._transaction_manager.is_send_to_partition_allowed(tp):
|
||||
return True
|
||||
if not self._transaction_manager.producer_id_and_epoch.is_valid:
|
||||
# we cannot send the batch until we have refreshed the PID
|
||||
log.debug("Waiting to send ready batches because transaction producer id is not valid")
|
||||
return True
|
||||
return False
|
||||
|
||||
def drain_batches_for_one_node(self, cluster, node_id, max_size, now=None):
|
||||
now = time.time() if now is None else now
|
||||
size = 0
|
||||
ready = []
|
||||
partitions = list(cluster.partitions_for_broker(node_id))
|
||||
if not partitions:
|
||||
return ready
|
||||
# to make starvation less likely this loop doesn't start at 0
|
||||
self._drain_index %= len(partitions)
|
||||
start = None
|
||||
while start != self._drain_index:
|
||||
tp = partitions[self._drain_index]
|
||||
if start is None:
|
||||
start = self._drain_index
|
||||
self._drain_index += 1
|
||||
self._drain_index %= len(partitions)
|
||||
|
||||
# Only proceed if the partition has no in-flight batches.
|
||||
if tp in self.muted:
|
||||
continue
|
||||
|
||||
if tp not in self._batches:
|
||||
continue
|
||||
|
||||
with self._tp_lock(tp):
|
||||
dq = self._batches[tp]
|
||||
if len(dq) == 0:
|
||||
continue
|
||||
first = dq[0]
|
||||
backoff = bool(first.attempts > 0 and
|
||||
first.last_attempt + self.config['retry_backoff_ms'] / 1000 > now)
|
||||
# Only drain the batch if it is not during backoff
|
||||
if backoff:
|
||||
continue
|
||||
|
||||
if (size + first.records.size_in_bytes() > max_size
|
||||
and len(ready) > 0):
|
||||
# there is a rare case that a single batch
|
||||
# size is larger than the request size due
|
||||
# to compression; in this case we will
|
||||
# still eventually send this batch in a
|
||||
# single request
|
||||
break
|
||||
else:
|
||||
if self._should_stop_drain_batches_for_partition(first, tp):
|
||||
break
|
||||
|
||||
batch = dq.popleft()
|
||||
if self._transaction_manager and not batch.in_retry():
|
||||
# If the batch is in retry, then we should not change the pid and
|
||||
# sequence number, since this may introduce duplicates. In particular,
|
||||
# the previous attempt may actually have been accepted, and if we change
|
||||
# the pid and sequence here, this attempt will also be accepted, causing
|
||||
# a duplicate.
|
||||
sequence_number = self._transaction_manager.sequence_number(batch.topic_partition)
|
||||
log.debug("Dest: %s: %s producer_id=%s epoch=%s sequence=%s",
|
||||
node_id, batch.topic_partition,
|
||||
self._transaction_manager.producer_id_and_epoch.producer_id,
|
||||
self._transaction_manager.producer_id_and_epoch.epoch,
|
||||
sequence_number)
|
||||
batch.records.set_producer_state(
|
||||
self._transaction_manager.producer_id_and_epoch.producer_id,
|
||||
self._transaction_manager.producer_id_and_epoch.epoch,
|
||||
sequence_number,
|
||||
self._transaction_manager.is_transactional()
|
||||
)
|
||||
batch.records.close()
|
||||
size += batch.records.size_in_bytes()
|
||||
ready.append(batch)
|
||||
batch.drained = now
|
||||
return ready
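# The drain loop above resumes from a rotating index so the same partition is not always
# drained first. A tiny standalone illustration of that round-robin walk (pure Python,
# names here are illustrative):
def round_robin_walk(items, start_index):
    """Yield items starting at start_index, wrapping around exactly once."""
    n = len(items)
    for i in range(n):
        yield items[(start_index + i) % n]

assert list(round_robin_walk(['p0', 'p1', 'p2'], 1)) == ['p1', 'p2', 'p0']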
|
||||
def drain(self, cluster, nodes, max_size, now=None):
|
||||
def drain(self, cluster, nodes, max_size):
|
||||
"""
|
||||
Drain all the data for the given nodes and collate them into a list of
|
||||
batches that will fit within the specified size on a per-node basis.
|
||||
@@ -553,17 +454,59 @@ class RecordAccumulator(object):
|
||||
if not nodes:
|
||||
return {}
|
||||
|
||||
now = time.time() if now is None else now
|
||||
now = time.time()
|
||||
batches = {}
|
||||
for node_id in nodes:
|
||||
batches[node_id] = self.drain_batches_for_one_node(cluster, node_id, max_size, now=now)
|
||||
size = 0
|
||||
partitions = list(cluster.partitions_for_broker(node_id))
|
||||
ready = []
|
||||
# to make starvation less likely this loop doesn't start at 0
|
||||
self._drain_index %= len(partitions)
|
||||
start = self._drain_index
|
||||
while True:
|
||||
tp = partitions[self._drain_index]
|
||||
if tp in self._batches and tp not in self.muted:
|
||||
with self._tp_locks[tp]:
|
||||
dq = self._batches[tp]
|
||||
if dq:
|
||||
first = dq[0]
|
||||
backoff = (
|
||||
bool(first.attempts > 0) and
|
||||
bool(first.last_attempt +
|
||||
self.config['retry_backoff_ms'] / 1000.0
|
||||
> now)
|
||||
)
|
||||
# Only drain the batch if it is not during backoff
|
||||
if not backoff:
|
||||
if (size + first.records.size_in_bytes() > max_size
|
||||
and len(ready) > 0):
|
||||
# there is a rare case that a single batch
|
||||
# size is larger than the request size due
|
||||
# to compression; in this case we will
|
||||
# still eventually send this batch in a
|
||||
# single request
|
||||
break
|
||||
else:
|
||||
batch = dq.popleft()
|
||||
batch.records.close()
|
||||
size += batch.records.size_in_bytes()
|
||||
ready.append(batch)
|
||||
batch.drained = now
|
||||
|
||||
self._drain_index += 1
|
||||
self._drain_index %= len(partitions)
|
||||
if start == self._drain_index:
|
||||
break
|
||||
|
||||
batches[node_id] = ready
|
||||
return batches
|
||||
|
||||
def deallocate(self, batch):
|
||||
"""Deallocate the record batch."""
|
||||
self._incomplete.remove(batch)
|
||||
self._free.deallocate(batch.buffer())
|
||||
|
||||
def flush_in_progress(self):
|
||||
def _flush_in_progress(self):
|
||||
"""Are there any threads currently waiting on a flush?"""
|
||||
return self._flushes_in_progress.get() > 0
|
||||
|
||||
@@ -592,10 +535,6 @@ class RecordAccumulator(object):
|
||||
finally:
|
||||
self._flushes_in_progress.decrement()
|
||||
|
||||
@property
|
||||
def has_incomplete(self):
|
||||
return bool(self._incomplete)
|
||||
|
||||
def abort_incomplete_batches(self):
|
||||
"""
|
||||
This function is only called when sender is closed forcefully. It will fail all the
|
||||
@@ -605,41 +544,27 @@ class RecordAccumulator(object):
|
||||
# 1. Avoid losing batches.
|
||||
# 2. Free up memory in case appending threads are blocked on buffer full.
|
||||
# This is a tight loop but should be able to get through very quickly.
|
||||
error = Errors.IllegalStateError("Producer is closed forcefully.")
|
||||
while True:
|
||||
self._abort_batches(error)
|
||||
self._abort_batches()
|
||||
if not self._appends_in_progress.get():
|
||||
break
|
||||
# After this point, no thread will append any messages because they will see the close
# flag set. We need to do one last abort after no thread is appending, in case there was a new
# batch appended by the last appending thread.
|
||||
self._abort_batches(error)
|
||||
self._abort_batches()
|
||||
self._batches.clear()
|
||||
|
||||
def _abort_batches(self, error):
|
||||
def _abort_batches(self):
|
||||
"""Go through incomplete batches and abort them."""
|
||||
error = Errors.IllegalStateError("Producer is closed forcefully.")
|
||||
for batch in self._incomplete.all():
|
||||
tp = batch.topic_partition
|
||||
# Close the batch before aborting
|
||||
with self._tp_lock(tp):
|
||||
with self._tp_locks[tp]:
|
||||
batch.records.close()
|
||||
self._batches[tp].remove(batch)
|
||||
batch.abort(error)
|
||||
batch.done(exception=error)
|
||||
self.deallocate(batch)
|
||||
|
||||
def abort_undrained_batches(self, error):
|
||||
for batch in self._incomplete.all():
|
||||
tp = batch.topic_partition
|
||||
with self._tp_lock(tp):
|
||||
aborted = False
|
||||
if not batch.is_done:
|
||||
aborted = True
|
||||
batch.records.close()
|
||||
self._batches[tp].remove(batch)
|
||||
if aborted:
|
||||
batch.abort(error)
|
||||
self.deallocate(batch)
|
||||
|
||||
def close(self):
|
||||
"""Close this accumulator and force all the record buffers to be drained."""
|
||||
self._closed = True
|
||||
@@ -654,21 +579,12 @@ class IncompleteProducerBatches(object):
|
||||
|
||||
def add(self, batch):
|
||||
with self._lock:
|
||||
self._incomplete.add(batch)
|
||||
return self._incomplete.add(batch)
|
||||
|
||||
def remove(self, batch):
|
||||
with self._lock:
|
||||
try:
|
||||
self._incomplete.remove(batch)
|
||||
except KeyError:
|
||||
pass
|
||||
return self._incomplete.remove(batch)
|
||||
|
||||
def all(self):
|
||||
with self._lock:
|
||||
return list(self._incomplete)
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self._incomplete)
|
||||
|
||||
|
||||
__nonzero__ = __bool__
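# For reference, a minimal thread-safe "incomplete batches" container along the lines of the
# class above (a sketch only, not the library class; the name ThreadSafeSet is made up):
import threading

class ThreadSafeSet(object):
    def __init__(self):
        self._items = set()
        self._lock = threading.Lock()

    def add(self, item):
        with self._lock:
            self._items.add(item)

    def remove(self, item):
        with self._lock:
            self._items.remove(item)   # raises KeyError if absent, mirroring the new behavior

    def all(self):
        with self._lock:
            return list(self._items)

    def __bool__(self):
        return bool(self._items)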
|
||||
@@ -2,7 +2,6 @@ from __future__ import absolute_import, division
|
||||
|
||||
import collections
|
||||
import copy
|
||||
import heapq
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
@@ -12,8 +11,6 @@ from kafka.vendor import six
|
||||
from kafka import errors as Errors
|
||||
from kafka.metrics.measurable import AnonMeasurable
|
||||
from kafka.metrics.stats import Avg, Max, Rate
|
||||
from kafka.producer.transaction_manager import ProducerIdAndEpoch
|
||||
from kafka.protocol.init_producer_id import InitProducerIdRequest
|
||||
from kafka.protocol.produce import ProduceRequest
|
||||
from kafka.structs import TopicPartition
|
||||
from kafka.version import __version__
|
||||
@@ -30,18 +27,14 @@ class Sender(threading.Thread):
|
||||
DEFAULT_CONFIG = {
|
||||
'max_request_size': 1048576,
|
||||
'acks': 1,
|
||||
'retries': float('inf'),
|
||||
'retries': 0,
|
||||
'request_timeout_ms': 30000,
|
||||
'retry_backoff_ms': 100,
|
||||
'metrics': None,
|
||||
'guarantee_message_order': False,
|
||||
'transaction_manager': None,
|
||||
'transactional_id': None,
|
||||
'transaction_timeout_ms': 60000,
|
||||
'client_id': 'kafka-python-' + __version__,
|
||||
'api_version': (0, 8, 0),
|
||||
}
|
||||
|
||||
def __init__(self, client, metadata, accumulator, **configs):
|
||||
def __init__(self, client, metadata, accumulator, metrics, **configs):
|
||||
super(Sender, self).__init__()
|
||||
self.config = copy.copy(self.DEFAULT_CONFIG)
|
||||
for key in self.config:
|
||||
@@ -55,75 +48,32 @@ class Sender(threading.Thread):
|
||||
self._running = True
|
||||
self._force_close = False
|
||||
self._topics_to_add = set()
|
||||
if self.config['metrics']:
|
||||
self._sensors = SenderMetrics(self.config['metrics'], self._client, self._metadata)
|
||||
else:
|
||||
self._sensors = None
|
||||
self._transaction_manager = self.config['transaction_manager']
|
||||
# A per-partition queue of batches ordered by creation time for tracking the in-flight batches
|
||||
self._in_flight_batches = collections.defaultdict(list)
|
||||
|
||||
def _maybe_remove_from_inflight_batches(self, batch):
|
||||
try:
|
||||
queue = self._in_flight_batches[batch.topic_partition]
|
||||
except KeyError:
|
||||
return
|
||||
try:
|
||||
idx = queue.index((batch.created, batch))
|
||||
except ValueError:
|
||||
return
|
||||
# https://stackoverflow.com/questions/10162679/python-delete-element-from-heap
|
||||
queue[idx] = queue[-1]
|
||||
queue.pop()
|
||||
heapq.heapify(queue)
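# Standalone illustration of the "swap with last element, pop, re-heapify" trick used above to
# delete an arbitrary entry from a heap (a sketch; the index lookup makes it O(n)):
import heapq

def heap_remove(heap, item):
    idx = heap.index(item)        # raises ValueError if the item is not present
    heap[idx] = heap[-1]
    heap.pop()
    if idx < len(heap):
        heapq.heapify(heap)

h = [1, 3, 5, 7, 9]
heapq.heapify(h)
heap_remove(h, 5)
assert sorted(h) == [1, 3, 7, 9]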
|
||||
def _get_expired_inflight_batches(self, now=None):
|
||||
"""Get the in-flight batches that has reached delivery timeout."""
|
||||
expired_batches = []
|
||||
to_remove = []
|
||||
for tp, queue in six.iteritems(self._in_flight_batches):
|
||||
while queue:
|
||||
_created_at, batch = queue[0]
|
||||
if batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms):
|
||||
heapq.heappop(queue)
|
||||
if batch.final_state is None:
|
||||
expired_batches.append(batch)
|
||||
else:
|
||||
raise Errors.IllegalStateError("%s batch created at %s gets unexpected final state %s" % (batch.topic_partition, batch.created, batch.final_state))
|
||||
else:
|
||||
self._accumulator.maybe_update_next_batch_expiry_time(batch)
|
||||
break
|
||||
else:
|
||||
# Avoid mutating in_flight_batches during iteration
|
||||
to_remove.append(tp)
|
||||
for tp in to_remove:
|
||||
del self._in_flight_batches[tp]
|
||||
return expired_batches
|
||||
self._sensors = SenderMetrics(metrics, self._client, self._metadata)
|
||||
|
||||
def run(self):
|
||||
"""The main run loop for the sender thread."""
|
||||
log.debug("%s: Starting Kafka producer I/O thread.", str(self))
|
||||
log.debug("Starting Kafka producer I/O thread.")
|
||||
|
||||
# main loop, runs until close is called
|
||||
while self._running:
|
||||
try:
|
||||
self.run_once()
|
||||
except Exception:
|
||||
log.exception("%s: Uncaught error in kafka producer I/O thread", str(self))
|
||||
log.exception("Uncaught error in kafka producer I/O thread")
|
||||
|
||||
log.debug("%s: Beginning shutdown of Kafka producer I/O thread, sending"
|
||||
" remaining records.", str(self))
|
||||
log.debug("Beginning shutdown of Kafka producer I/O thread, sending"
|
||||
" remaining records.")
|
||||
|
||||
# okay we stopped accepting requests but there may still be
|
||||
# requests in the accumulator or waiting for acknowledgment,
|
||||
# wait until these are completed.
|
||||
while (not self._force_close
|
||||
and (self._accumulator.has_undrained()
|
||||
and (self._accumulator.has_unsent()
|
||||
or self._client.in_flight_request_count() > 0)):
|
||||
try:
|
||||
self.run_once()
|
||||
except Exception:
|
||||
log.exception("%s: Uncaught error in kafka producer I/O thread", str(self))
|
||||
log.exception("Uncaught error in kafka producer I/O thread")
|
||||
|
||||
if self._force_close:
|
||||
# We need to fail all the incomplete batches and wake up the
|
||||
@@ -133,75 +83,38 @@ class Sender(threading.Thread):
|
||||
try:
|
||||
self._client.close()
|
||||
except Exception:
|
||||
log.exception("%s: Failed to close network client", str(self))
|
||||
log.exception("Failed to close network client")
|
||||
|
||||
log.debug("%s: Shutdown of Kafka producer I/O thread has completed.", str(self))
|
||||
log.debug("Shutdown of Kafka producer I/O thread has completed.")
|
||||
|
||||
def run_once(self):
|
||||
"""Run a single iteration of sending."""
|
||||
while self._topics_to_add:
|
||||
self._client.add_topic(self._topics_to_add.pop())
|
||||
|
||||
if self._transaction_manager:
|
||||
try:
|
||||
if not self._transaction_manager.is_transactional():
|
||||
# this is an idempotent producer, so make sure we have a producer id
|
||||
self._maybe_wait_for_producer_id()
|
||||
elif self._transaction_manager.has_in_flight_transactional_request() or self._maybe_send_transactional_request():
|
||||
# as long as there are outstanding transactional requests, we simply wait for them to return
|
||||
self._client.poll(timeout_ms=self.config['retry_backoff_ms'])
|
||||
return
|
||||
|
||||
# do not continue sending if the transaction manager is in a failed state or if there
|
||||
# is no producer id (for the idempotent case).
|
||||
if self._transaction_manager.has_fatal_error() or not self._transaction_manager.has_producer_id():
|
||||
last_error = self._transaction_manager.last_error
|
||||
if last_error is not None:
|
||||
self._maybe_abort_batches(last_error)
|
||||
self._client.poll(timeout_ms=self.config['retry_backoff_ms'])
|
||||
return
|
||||
elif self._transaction_manager.has_abortable_error():
|
||||
self._accumulator.abort_undrained_batches(self._transaction_manager.last_error)
|
||||
|
||||
except Errors.SaslAuthenticationFailedError as e:
|
||||
# This is already logged as error, but propagated here to perform any clean ups.
|
||||
log.debug("%s: Authentication exception while processing transactional request: %s", str(self), e)
|
||||
self._transaction_manager.authentication_failed(e)
|
||||
|
||||
poll_timeout_ms = self._send_producer_data()
|
||||
self._client.poll(timeout_ms=poll_timeout_ms)
|
||||
|
||||
def _send_producer_data(self, now=None):
|
||||
now = time.time() if now is None else now
|
||||
# get the list of partitions with data ready to send
|
||||
result = self._accumulator.ready(self._metadata, now=now)
|
||||
result = self._accumulator.ready(self._metadata)
|
||||
ready_nodes, next_ready_check_delay, unknown_leaders_exist = result
|
||||
|
||||
# if there are any partitions whose leaders are not known yet, force
|
||||
# metadata update
|
||||
if unknown_leaders_exist:
|
||||
log.debug('%s: Unknown leaders exist, requesting metadata update', str(self))
|
||||
log.debug('Unknown leaders exist, requesting metadata update')
|
||||
self._metadata.request_update()
|
||||
|
||||
# remove any nodes we aren't ready to send to
|
||||
not_ready_timeout_ms = float('inf')
|
||||
not_ready_timeout = float('inf')
|
||||
for node in list(ready_nodes):
|
||||
if not self._client.is_ready(node):
|
||||
node_delay_ms = self._client.connection_delay(node)
|
||||
log.debug('%s: Node %s not ready; delaying produce of accumulated batch (%f ms)', str(self), node, node_delay_ms)
|
||||
log.debug('Node %s not ready; delaying produce of accumulated batch', node)
|
||||
self._client.maybe_connect(node, wakeup=False)
|
||||
ready_nodes.remove(node)
|
||||
not_ready_timeout_ms = min(not_ready_timeout_ms, node_delay_ms)
|
||||
not_ready_timeout = min(not_ready_timeout,
|
||||
self._client.connection_delay(node))
|
||||
|
||||
# create produce requests
|
||||
batches_by_node = self._accumulator.drain(
|
||||
self._metadata, ready_nodes, self.config['max_request_size'], now=now)
|
||||
|
||||
for batch_list in six.itervalues(batches_by_node):
|
||||
for batch in batch_list:
|
||||
item = (batch.created, batch)
|
||||
queue = self._in_flight_batches[batch.topic_partition]
|
||||
heapq.heappush(queue, item)
|
||||
self._metadata, ready_nodes, self.config['max_request_size'])
|
||||
|
||||
if self.config['guarantee_message_order']:
|
||||
# Mute all the partitions drained
|
||||
@@ -209,130 +122,42 @@ class Sender(threading.Thread):
|
||||
for batch in batch_list:
|
||||
self._accumulator.muted.add(batch.topic_partition)
|
||||
|
||||
self._accumulator.reset_next_batch_expiry_time()
|
||||
expired_batches = self._accumulator.expired_batches(now=now)
|
||||
expired_batches.extend(self._get_expired_inflight_batches(now=now))
|
||||
|
||||
if expired_batches:
|
||||
log.debug("%s: Expired %s batches in accumulator", str(self), len(expired_batches))
|
||||
|
||||
# Reset the producer_id if an expired batch has previously been sent to the broker.
|
||||
# See the documentation of `TransactionState.reset_producer_id` to understand why
|
||||
# we need to reset the producer id here.
|
||||
if self._transaction_manager and any([batch.in_retry() for batch in expired_batches]):
|
||||
needs_transaction_state_reset = True
|
||||
else:
|
||||
needs_transaction_state_reset = False
|
||||
|
||||
expired_batches = self._accumulator.abort_expired_batches(
|
||||
self.config['request_timeout_ms'], self._metadata)
|
||||
for expired_batch in expired_batches:
|
||||
error = Errors.KafkaTimeoutError(
|
||||
"Expiring %d record(s) for %s: %s ms has passed since batch creation" % (
|
||||
expired_batch.record_count, expired_batch.topic_partition,
|
||||
int((time.time() - expired_batch.created) * 1000)))
|
||||
self._fail_batch(expired_batch, error, base_offset=-1)
|
||||
|
||||
if self._sensors:
|
||||
self._sensors.update_produce_request_metrics(batches_by_node)
|
||||
|
||||
if needs_transaction_state_reset:
|
||||
self._transaction_manager.reset_producer_id()
|
||||
return 0
|
||||
self._sensors.record_errors(expired_batch.topic_partition.topic, expired_batch.record_count)
|
||||
|
||||
self._sensors.update_produce_request_metrics(batches_by_node)
|
||||
requests = self._create_produce_requests(batches_by_node)
|
||||
# If we have any nodes that are ready to send + have sendable data,
|
||||
# poll with 0 timeout so this can immediately loop and try sending more
|
||||
# data. Otherwise, the timeout will be the smaller value between next
|
||||
# batch expiry time, and the delay time for checking data availability.
|
||||
# Note that the nodes may have data that isn't yet sendable due to
|
||||
# lingering, backing off, etc. This specifically does not include nodes with
|
||||
# data. Otherwise, the timeout is determined by nodes that have
|
||||
# partitions with data that isn't yet sendable (e.g. lingering, backing
|
||||
# off). Note that this specifically does not include nodes with
|
||||
# sendable data that aren't ready to send since they would cause busy
|
||||
# looping.
|
||||
poll_timeout_ms = min(next_ready_check_delay * 1000,
|
||||
not_ready_timeout_ms,
|
||||
self._accumulator.next_expiry_time_ms - now * 1000)
|
||||
if poll_timeout_ms < 0:
|
||||
poll_timeout_ms = 0
|
||||
|
||||
poll_timeout_ms = min(next_ready_check_delay * 1000, not_ready_timeout)
|
||||
if ready_nodes:
|
||||
log.debug("%s: Nodes with data ready to send: %s", str(self), ready_nodes) # trace
|
||||
log.debug("%s: Created %d produce requests: %s", str(self), len(requests), requests) # trace
|
||||
# if some partitions are already ready to be sent, the select time
|
||||
# would be 0; otherwise if some partition already has some data
|
||||
# accumulated but not ready yet, the select time will be the time
|
||||
# difference between now and its linger expiry time; otherwise the
|
||||
# select time will be the time difference between now and the
|
||||
# metadata expiry time
|
||||
log.debug("Nodes with data ready to send: %s", ready_nodes) # trace
|
||||
log.debug("Created %d produce requests: %s", len(requests), requests) # trace
|
||||
poll_timeout_ms = 0
|
||||
|
||||
for node_id, request in six.iteritems(requests):
|
||||
batches = batches_by_node[node_id]
|
||||
log.debug('%s: Sending Produce Request: %r', str(self), request)
|
||||
log.debug('Sending Produce Request: %r', request)
|
||||
(self._client.send(node_id, request, wakeup=False)
|
||||
.add_callback(
|
||||
self._handle_produce_response, node_id, time.time(), batches)
|
||||
.add_errback(
|
||||
self._failed_produce, batches, node_id))
|
||||
return poll_timeout_ms
|
||||
|
||||
def _maybe_send_transactional_request(self):
|
||||
if self._transaction_manager.is_completing() and self._accumulator.has_incomplete:
|
||||
if self._transaction_manager.is_aborting():
|
||||
self._accumulator.abort_undrained_batches(Errors.KafkaError("Failing batch since transaction was aborted"))
|
||||
# There may still be requests left which are being retried. Since we do not know whether they had
|
||||
# been successfully appended to the broker log, we must resend them until their final status is clear.
|
||||
# If they had been appended and we did not receive the error, then our sequence number would no longer
|
||||
# be correct which would lead to an OutOfSequenceNumberError.
|
||||
if not self._accumulator.flush_in_progress():
|
||||
self._accumulator.begin_flush()
|
||||
|
||||
next_request_handler = self._transaction_manager.next_request_handler(self._accumulator.has_incomplete)
|
||||
if next_request_handler is None:
|
||||
return False
|
||||
|
||||
log.debug("%s: Sending transactional request %s", str(self), next_request_handler.request)
|
||||
while not self._force_close:
|
||||
target_node = None
|
||||
try:
|
||||
if next_request_handler.needs_coordinator():
|
||||
target_node = self._transaction_manager.coordinator(next_request_handler.coordinator_type)
|
||||
if target_node is None:
|
||||
self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
|
||||
break
|
||||
elif not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']):
|
||||
self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
|
||||
target_node = None
|
||||
break
|
||||
else:
|
||||
target_node = self._client.least_loaded_node()
|
||||
if target_node is not None and not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']):
|
||||
target_node = None
|
||||
|
||||
if target_node is not None:
|
||||
if next_request_handler.is_retry:
|
||||
time.sleep(self.config['retry_backoff_ms'] / 1000)
|
||||
txn_correlation_id = self._transaction_manager.next_in_flight_request_correlation_id()
|
||||
future = self._client.send(target_node, next_request_handler.request)
|
||||
future.add_both(next_request_handler.on_complete, txn_correlation_id)
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
log.warning("%s: Got an exception when trying to find a node to send a transactional request to. Going to back off and retry: %s", str(self), e)
|
||||
if next_request_handler.needs_coordinator():
|
||||
self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
|
||||
break
|
||||
|
||||
time.sleep(self.config['retry_backoff_ms'] / 1000)
|
||||
self._metadata.request_update()
|
||||
|
||||
if target_node is None:
|
||||
self._transaction_manager.retry(next_request_handler)
|
||||
|
||||
return True
|
||||
|
||||
def _maybe_abort_batches(self, exc):
|
||||
if self._accumulator.has_incomplete:
|
||||
log.error("%s: Aborting producer batches due to fatal error: %s", str(self), exc)
|
||||
self._accumulator.abort_batches(exc)
|
||||
# if some partitions are already ready to be sent, the select time
|
||||
# would be 0; otherwise if some partition already has some data
|
||||
# accumulated but not ready yet, the select time will be the time
|
||||
# difference between now and its linger expiry time; otherwise the
|
||||
# select time will be the time difference between now and the
|
||||
# metadata expiry time
|
||||
self._client.poll(timeout_ms=poll_timeout_ms)
|
||||
|
||||
def initiate_close(self):
|
||||
"""Start closing the sender (won't complete until all data is sent)."""
|
||||
@@ -355,164 +180,82 @@ class Sender(threading.Thread):
|
||||
self._topics_to_add.add(topic)
|
||||
self.wakeup()
|
||||
|
||||
def _maybe_wait_for_producer_id(self):
|
||||
while not self._transaction_manager.has_producer_id():
|
||||
try:
|
||||
node_id = self._client.least_loaded_node()
|
||||
if node_id is None or not self._client.await_ready(node_id):
|
||||
log.debug("%s, Could not find an available broker to send InitProducerIdRequest to." +
|
||||
" Will back off and try again.", str(self))
|
||||
time.sleep(self._client.least_loaded_node_refresh_ms() / 1000)
|
||||
continue
|
||||
version = self._client.api_version(InitProducerIdRequest, max_version=1)
|
||||
request = InitProducerIdRequest[version](
|
||||
transactional_id=self.config['transactional_id'],
|
||||
transaction_timeout_ms=self.config['transaction_timeout_ms'],
|
||||
)
|
||||
response = self._client.send_and_receive(node_id, request)
|
||||
error_type = Errors.for_code(response.error_code)
|
||||
if error_type is Errors.NoError:
|
||||
self._transaction_manager.set_producer_id_and_epoch(ProducerIdAndEpoch(response.producer_id, response.producer_epoch))
|
||||
break
|
||||
elif getattr(error_type, 'retriable', False):
|
||||
log.debug("%s: Retriable error from InitProducerId response: %s", str(self), error_type.__name__)
|
||||
if getattr(error_type, 'invalid_metadata', False):
|
||||
self._metadata.request_update()
|
||||
else:
|
||||
self._transaction_manager.transition_to_fatal_error(error_type())
|
||||
break
|
||||
except Errors.KafkaConnectionError:
|
||||
log.debug("%s: Broker %s disconnected while awaiting InitProducerId response", str(self), node_id)
|
||||
except Errors.RequestTimedOutError:
|
||||
log.debug("%s: InitProducerId request to node %s timed out", str(self), node_id)
|
||||
log.debug("%s: Retry InitProducerIdRequest in %sms.", str(self), self.config['retry_backoff_ms'])
|
||||
time.sleep(self.config['retry_backoff_ms'] / 1000)
|
||||
|
||||
def _failed_produce(self, batches, node_id, error):
|
||||
log.error("%s: Error sending produce request to node %d: %s", str(self), node_id, error) # trace
|
||||
log.debug("Error sending produce request to node %d: %s", node_id, error) # trace
|
||||
for batch in batches:
|
||||
self._complete_batch(batch, error, -1)
|
||||
self._complete_batch(batch, error, -1, None)
|
||||
|
||||
def _handle_produce_response(self, node_id, send_time, batches, response):
|
||||
"""Handle a produce response."""
|
||||
# if we have a response, parse it
|
||||
log.debug('%s: Parsing produce response: %r', str(self), response)
|
||||
log.debug('Parsing produce response: %r', response)
|
||||
if response:
|
||||
batches_by_partition = dict([(batch.topic_partition, batch)
|
||||
for batch in batches])
|
||||
|
||||
for topic, partitions in response.topics:
|
||||
for partition_info in partitions:
|
||||
global_error = None
|
||||
log_start_offset = None
|
||||
if response.API_VERSION < 2:
|
||||
partition, error_code, offset = partition_info
|
||||
ts = None
|
||||
elif 2 <= response.API_VERSION <= 4:
|
||||
partition, error_code, offset, ts = partition_info
|
||||
elif 5 <= response.API_VERSION <= 7:
|
||||
partition, error_code, offset, ts, _log_start_offset = partition_info
|
||||
partition, error_code, offset, ts, log_start_offset = partition_info
|
||||
else:
|
||||
# Currently unused / TODO: KIP-467
|
||||
partition, error_code, offset, ts, _log_start_offset, _record_errors, _global_error = partition_info
|
||||
# the ignored parameter is record_error of type list[(batch_index: int, error_message: str)]
|
||||
partition, error_code, offset, ts, log_start_offset, _, global_error = partition_info
|
||||
tp = TopicPartition(topic, partition)
|
||||
error = Errors.for_code(error_code)
|
||||
batch = batches_by_partition[tp]
|
||||
self._complete_batch(batch, error, offset, timestamp_ms=ts)
|
||||
self._complete_batch(batch, error, offset, ts, log_start_offset, global_error)
|
||||
|
||||
if response.API_VERSION > 0:
|
||||
self._sensors.record_throttle_time(response.throttle_time_ms, node=node_id)
|
||||
|
||||
else:
|
||||
# this is the acks = 0 case, just complete all requests
|
||||
for batch in batches:
|
||||
self._complete_batch(batch, None, -1)
|
||||
self._complete_batch(batch, None, -1, None)
|
||||
|
||||
def _fail_batch(self, batch, exception, base_offset=None, timestamp_ms=None):
|
||||
exception = exception if type(exception) is not type else exception()
|
||||
if self._transaction_manager:
|
||||
if isinstance(exception, Errors.OutOfOrderSequenceNumberError) and \
|
||||
not self._transaction_manager.is_transactional() and \
|
||||
self._transaction_manager.has_producer_id(batch.producer_id):
|
||||
log.error("%s: The broker received an out of order sequence number for topic-partition %s"
|
||||
" at offset %s. This indicates data loss on the broker, and should be investigated.",
|
||||
str(self), batch.topic_partition, base_offset)
|
||||
|
||||
# Reset the transaction state since we have hit an irrecoverable exception and cannot make any guarantees
|
||||
# about the previously committed message. Note that this will discard the producer id and sequence
|
||||
# numbers for all existing partitions.
|
||||
self._transaction_manager.reset_producer_id()
|
||||
elif isinstance(exception, (Errors.ClusterAuthorizationFailedError,
|
||||
Errors.TransactionalIdAuthorizationFailedError,
|
||||
Errors.ProducerFencedError,
|
||||
Errors.InvalidTxnStateError)):
|
||||
self._transaction_manager.transition_to_fatal_error(exception)
|
||||
elif self._transaction_manager.is_transactional():
|
||||
self._transaction_manager.transition_to_abortable_error(exception)
|
||||
|
||||
if self._sensors:
|
||||
self._sensors.record_errors(batch.topic_partition.topic, batch.record_count)
|
||||
|
||||
if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms, exception=exception):
|
||||
self._maybe_remove_from_inflight_batches(batch)
|
||||
self._accumulator.deallocate(batch)
|
||||
|
||||
def _complete_batch(self, batch, error, base_offset, timestamp_ms=None):
|
||||
def _complete_batch(self, batch, error, base_offset, timestamp_ms=None, log_start_offset=None, global_error=None):
|
||||
"""Complete or retry the given batch of records.
|
||||
|
||||
Arguments:
|
||||
batch (ProducerBatch): The record batch
|
||||
batch (RecordBatch): The record batch
|
||||
error (Exception): The error (or None if none)
|
||||
base_offset (int): The base offset assigned to the records if successful
|
||||
timestamp_ms (int, optional): The timestamp returned by the broker for this batch
|
||||
log_start_offset (int): The start offset of the log at the time this produce response was created
|
||||
global_error (str): The summarising error message
|
||||
"""
|
||||
# Standardize no-error to None
|
||||
if error is Errors.NoError:
|
||||
error = None
|
||||
|
||||
if error is not None:
|
||||
if self._can_retry(batch, error):
|
||||
# retry
|
||||
log.warning("%s: Got error produce response on topic-partition %s,"
|
||||
" retrying (%s attempts left). Error: %s",
|
||||
str(self), batch.topic_partition,
|
||||
self.config['retries'] - batch.attempts - 1,
|
||||
error)
|
||||
|
||||
# If idempotence is enabled only retry the request if the batch matches our current producer id and epoch
|
||||
if not self._transaction_manager or self._transaction_manager.producer_id_and_epoch.match(batch):
|
||||
log.debug("%s: Retrying batch to topic-partition %s. Sequence number: %s",
|
||||
str(self), batch.topic_partition,
|
||||
self._transaction_manager.sequence_number(batch.topic_partition) if self._transaction_manager else None)
|
||||
self._accumulator.reenqueue(batch)
|
||||
self._maybe_remove_from_inflight_batches(batch)
|
||||
if self._sensors:
|
||||
self._sensors.record_retries(batch.topic_partition.topic, batch.record_count)
|
||||
else:
|
||||
log.warning("%s: Attempted to retry sending a batch but the producer id/epoch changed from %s/%s to %s/%s. This batch will be dropped",
|
||||
str(self), batch.producer_id, batch.producer_epoch,
|
||||
self._transaction_manager.producer_id_and_epoch.producer_id,
|
||||
self._transaction_manager.producer_id_and_epoch.epoch)
|
||||
self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms)
|
||||
else:
|
||||
if error is Errors.TopicAuthorizationFailedError:
|
||||
error = error(batch.topic_partition.topic)
|
||||
|
||||
# tell the user the result of their request
|
||||
self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms)
|
||||
|
||||
if error is Errors.UnknownTopicOrPartitionError:
|
||||
log.warning("%s: Received unknown topic or partition error in produce request on partition %s."
|
||||
" The topic/partition may not exist or the user may not have Describe access to it",
|
||||
str(self), batch.topic_partition)
|
||||
|
||||
if getattr(error, 'invalid_metadata', False):
|
||||
self._metadata.request_update()
|
||||
|
||||
if error is not None and self._can_retry(batch, error):
|
||||
# retry
|
||||
log.warning("Got error produce response on topic-partition %s,"
|
||||
" retrying (%d attempts left). Error: %s",
|
||||
batch.topic_partition,
|
||||
self.config['retries'] - batch.attempts - 1,
|
||||
global_error or error)
|
||||
self._accumulator.reenqueue(batch)
|
||||
self._sensors.record_retries(batch.topic_partition.topic, batch.record_count)
|
||||
else:
|
||||
if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms):
|
||||
self._maybe_remove_from_inflight_batches(batch)
|
||||
self._accumulator.deallocate(batch)
|
||||
if error is Errors.TopicAuthorizationFailedError:
|
||||
error = error(batch.topic_partition.topic)
|
||||
|
||||
if self._transaction_manager and self._transaction_manager.producer_id_and_epoch.match(batch):
|
||||
self._transaction_manager.increment_sequence_number(batch.topic_partition, batch.record_count)
|
||||
log.debug("%s: Incremented sequence number for topic-partition %s to %s", str(self), batch.topic_partition,
|
||||
self._transaction_manager.sequence_number(batch.topic_partition))
|
||||
# tell the user the result of their request
|
||||
batch.done(base_offset, timestamp_ms, error, log_start_offset, global_error)
|
||||
self._accumulator.deallocate(batch)
|
||||
if error is not None:
|
||||
self._sensors.record_errors(batch.topic_partition.topic, batch.record_count)
|
||||
|
||||
if getattr(error, 'invalid_metadata', False):
|
||||
self._metadata.request_update()
|
||||
|
||||
# Unmute the completed partition.
|
||||
if self.config['guarantee_message_order']:
|
||||
@@ -523,10 +266,8 @@ class Sender(threading.Thread):
|
||||
We can retry a send if the error is transient and the number of
|
||||
attempts taken is fewer than the maximum allowed
|
||||
"""
|
||||
return (not batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms) and
|
||||
batch.attempts < self.config['retries'] and
|
||||
batch.final_state is None and
|
||||
getattr(error, 'retriable', False))
|
||||
return (batch.attempts < self.config['retries']
|
||||
and getattr(error, 'retriable', False))
|
||||
|
||||
def _create_produce_requests(self, collated):
|
||||
"""
|
||||
@@ -534,24 +275,23 @@ class Sender(threading.Thread):
|
||||
per-node basis.
|
||||
|
||||
Arguments:
|
||||
collated: {node_id: [ProducerBatch]}
|
||||
collated: {node_id: [RecordBatch]}
|
||||
|
||||
Returns:
|
||||
dict: {node_id: ProduceRequest} (version depends on client api_versions)
|
||||
dict: {node_id: ProduceRequest} (version depends on api_version)
|
||||
"""
|
||||
requests = {}
|
||||
for node_id, batches in six.iteritems(collated):
|
||||
if batches:
|
||||
requests[node_id] = self._produce_request(
|
||||
node_id, self.config['acks'],
|
||||
self.config['request_timeout_ms'], batches)
|
||||
requests[node_id] = self._produce_request(
|
||||
node_id, self.config['acks'],
|
||||
self.config['request_timeout_ms'], batches)
|
||||
return requests
|
||||
|
||||
def _produce_request(self, node_id, acks, timeout, batches):
|
||||
"""Create a produce request from the given record batches.
|
||||
|
||||
Returns:
|
||||
ProduceRequest (version depends on client api_versions)
|
||||
ProduceRequest (version depends on api_version)
|
||||
"""
|
||||
produce_records_by_partition = collections.defaultdict(dict)
|
||||
for batch in batches:
|
||||
@@ -561,26 +301,32 @@ class Sender(threading.Thread):
|
||||
buf = batch.records.buffer()
|
||||
produce_records_by_partition[topic][partition] = buf
|
||||
|
||||
version = self._client.api_version(ProduceRequest, max_version=7)
|
||||
topic_partition_data = [
|
||||
(topic, list(partition_info.items()))
|
||||
for topic, partition_info in six.iteritems(produce_records_by_partition)]
|
||||
transactional_id = self._transaction_manager.transactional_id if self._transaction_manager else None
|
||||
if version >= 3:
|
||||
return ProduceRequest[version](
|
||||
transactional_id=transactional_id,
|
||||
required_acks=acks,
|
||||
timeout=timeout,
|
||||
topics=topic_partition_data,
|
||||
)
|
||||
kwargs = {}
|
||||
if self.config['api_version'] >= (2, 1):
|
||||
version = 7
|
||||
elif self.config['api_version'] >= (2, 0):
|
||||
version = 6
|
||||
elif self.config['api_version'] >= (1, 1):
|
||||
version = 5
|
||||
elif self.config['api_version'] >= (1, 0):
|
||||
version = 4
|
||||
elif self.config['api_version'] >= (0, 11):
|
||||
version = 3
|
||||
kwargs = dict(transactional_id=None)
|
||||
elif self.config['api_version'] >= (0, 10):
|
||||
version = 2
|
||||
elif self.config['api_version'] == (0, 9):
|
||||
version = 1
|
||||
else:
|
||||
if transactional_id is not None:
|
||||
log.warning('%s: Broker does not support ProduceRequest v3+, required for transactional_id', str(self))
|
||||
return ProduceRequest[version](
|
||||
required_acks=acks,
|
||||
timeout=timeout,
|
||||
topics=topic_partition_data,
|
||||
)
|
||||
version = 0
|
||||
return ProduceRequest[version](
|
||||
required_acks=acks,
|
||||
timeout=timeout,
|
||||
topics=[(topic, list(partition_info.items()))
|
||||
for topic, partition_info
|
||||
in six.iteritems(produce_records_by_partition)],
|
||||
**kwargs
|
||||
)
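# The api_version-based branches above, captured as a small lookup helper for reference. This
# mirrors only the config-driven fallback path shown here; the helper name is illustrative and
# not part of the library:
def produce_request_version(api_version):
    if api_version >= (2, 1):
        return 7
    if api_version >= (2, 0):
        return 6
    if api_version >= (1, 1):
        return 5
    if api_version >= (1, 0):
        return 4
    if api_version >= (0, 11):
        return 3
    if api_version >= (0, 10):
        return 2
    if api_version == (0, 9):
        return 1
    return 0

assert produce_request_version((2, 3)) == 7
assert produce_request_version((0, 8, 0)) == 0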
|
||||
def wakeup(self):
|
||||
"""Wake up the selector associated with this send thread."""
|
||||
@@ -589,9 +335,6 @@ class Sender(threading.Thread):
|
||||
def bootstrap_connected(self):
|
||||
return self._client.bootstrap_connected()
|
||||
|
||||
def __str__(self):
|
||||
return "<Sender client_id=%s transactional_id=%s>" % (self.config['client_id'], self.config['transactional_id'])
|
||||
|
||||
|
||||
class SenderMetrics(object):
|
||||
|
||||
@@ -624,6 +367,15 @@ class SenderMetrics(object):
|
||||
sensor_name=sensor_name,
|
||||
description='The maximum time in ms record batches spent in the record accumulator.')
|
||||
|
||||
sensor_name = 'produce-throttle-time'
|
||||
self.produce_throttle_time_sensor = self.metrics.sensor(sensor_name)
|
||||
self.add_metric('produce-throttle-time-avg', Avg(),
|
||||
sensor_name=sensor_name,
|
||||
description='The average throttle time in ms')
|
||||
self.add_metric('produce-throttle-time-max', Max(),
|
||||
sensor_name=sensor_name,
|
||||
description='The maximum throttle time in ms')
|
||||
|
||||
sensor_name = 'records-per-request'
|
||||
self.records_per_request_sensor = self.metrics.sensor(sensor_name)
|
||||
self.add_metric('record-send-rate', Rate(),
|
||||
@@ -746,9 +498,8 @@ class SenderMetrics(object):
|
||||
records += batch.record_count
|
||||
total_bytes += batch.records.size_in_bytes()
|
||||
|
||||
if node_batch:
|
||||
self.records_per_request_sensor.record(records)
|
||||
self.byte_rate_sensor.record(total_bytes)
|
||||
self.records_per_request_sensor.record(records)
|
||||
self.byte_rate_sensor.record(total_bytes)
|
||||
|
||||
def record_retries(self, topic, count):
|
||||
self.retry_sensor.record(count)
|
||||
@@ -761,3 +512,6 @@ class SenderMetrics(object):
|
||||
sensor = self.metrics.get_sensor('topic.' + topic + '.record-errors')
|
||||
if sensor:
|
||||
sensor.record(count)
|
||||
|
||||
def record_throttle_time(self, throttle_time_ms, node=None):
|
||||
self.produce_throttle_time_sensor.record(throttle_time_ms)
|
||||
|
||||
@@ -1,981 +0,0 @@
|
||||
from __future__ import absolute_import, division
|
||||
|
||||
import abc
|
||||
import collections
|
||||
import heapq
|
||||
import logging
|
||||
import threading
|
||||
|
||||
from kafka.vendor import six
|
||||
|
||||
try:
|
||||
# enum in stdlib as of py3.4
|
||||
from enum import IntEnum # pylint: disable=import-error
|
||||
except ImportError:
|
||||
# vendored backport module
|
||||
from kafka.vendor.enum34 import IntEnum
|
||||
|
||||
import kafka.errors as Errors
|
||||
from kafka.protocol.add_offsets_to_txn import AddOffsetsToTxnRequest
|
||||
from kafka.protocol.add_partitions_to_txn import AddPartitionsToTxnRequest
|
||||
from kafka.protocol.end_txn import EndTxnRequest
|
||||
from kafka.protocol.find_coordinator import FindCoordinatorRequest
|
||||
from kafka.protocol.init_producer_id import InitProducerIdRequest
|
||||
from kafka.protocol.txn_offset_commit import TxnOffsetCommitRequest
|
||||
from kafka.structs import TopicPartition
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
NO_PRODUCER_ID = -1
|
||||
NO_PRODUCER_EPOCH = -1
|
||||
NO_SEQUENCE = -1
|
||||
|
||||
|
||||
class ProducerIdAndEpoch(object):
|
||||
__slots__ = ('producer_id', 'epoch')
|
||||
|
||||
def __init__(self, producer_id, epoch):
|
||||
self.producer_id = producer_id
|
||||
self.epoch = epoch
|
||||
|
||||
@property
|
||||
def is_valid(self):
|
||||
return NO_PRODUCER_ID < self.producer_id
|
||||
|
||||
def match(self, batch):
|
||||
return self.producer_id == batch.producer_id and self.epoch == batch.producer_epoch
|
||||
|
||||
def __eq__(self, other):
|
||||
return isinstance(other, ProducerIdAndEpoch) and self.producer_id == other.producer_id and self.epoch == other.epoch
|
||||
|
||||
def __str__(self):
|
||||
return "ProducerIdAndEpoch(producer_id={}, epoch={})".format(self.producer_id, self.epoch)
|
||||
|
||||
|
||||
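# Illustrative sketch (not part of the original module): the sentinel pair above is
# reported as invalid until a real producer id is assigned via InitProducerId. The
# numeric values 4000/1 below are made-up example values.
def _example_producer_id_and_epoch():
    unassigned = ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH)
    assert not unassigned.is_valid
    assigned = ProducerIdAndEpoch(4000, 1)
    assert assigned.is_valid and assigned == ProducerIdAndEpoch(4000, 1)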
class TransactionState(IntEnum):
|
||||
UNINITIALIZED = 0
|
||||
INITIALIZING = 1
|
||||
READY = 2
|
||||
IN_TRANSACTION = 3
|
||||
COMMITTING_TRANSACTION = 4
|
||||
ABORTING_TRANSACTION = 5
|
||||
ABORTABLE_ERROR = 6
|
||||
FATAL_ERROR = 7
|
||||
|
||||
@classmethod
|
||||
def is_transition_valid(cls, source, target):
|
||||
if target == cls.INITIALIZING:
|
||||
return source == cls.UNINITIALIZED
|
||||
elif target == cls.READY:
|
||||
return source in (cls.INITIALIZING, cls.COMMITTING_TRANSACTION, cls.ABORTING_TRANSACTION)
|
||||
elif target == cls.IN_TRANSACTION:
|
||||
return source == cls.READY
|
||||
elif target == cls.COMMITTING_TRANSACTION:
|
||||
return source == cls.IN_TRANSACTION
|
||||
elif target == cls.ABORTING_TRANSACTION:
|
||||
return source in (cls.IN_TRANSACTION, cls.ABORTABLE_ERROR)
|
||||
elif target == cls.ABORTABLE_ERROR:
|
||||
return source in (cls.IN_TRANSACTION, cls.COMMITTING_TRANSACTION, cls.ABORTABLE_ERROR)
|
||||
elif target == cls.UNINITIALIZED:
|
||||
# Disallow transitions to UNINITIALIZED
|
||||
return False
|
||||
elif target == cls.FATAL_ERROR:
|
||||
# We can transition to FATAL_ERROR unconditionally.
|
||||
# FATAL_ERROR is never a valid starting state for any transition. So the only option is to close the
|
||||
# producer or do purely non transactional requests.
|
||||
return True
|
||||
|
||||
|
||||
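# A quick illustration of the transition rules above (sketch, not in the original
# source); is_transition_valid is a pure predicate, so it can be exercised directly:
def _example_transaction_state_transitions():
    assert TransactionState.is_transition_valid(
        TransactionState.UNINITIALIZED, TransactionState.INITIALIZING)
    assert TransactionState.is_transition_valid(
        TransactionState.IN_TRANSACTION, TransactionState.COMMITTING_TRANSACTION)
    # Committing straight from READY (without begin_transaction) is rejected.
    assert not TransactionState.is_transition_valid(
        TransactionState.READY, TransactionState.COMMITTING_TRANSACTION)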
class Priority(IntEnum):
|
||||
# We use the priority to determine the order in which requests need to be sent out. For instance, if we have
|
||||
# a pending FindCoordinator request, that must always go first. Next, if we need a producer id, that must go second.
|
||||
# The endTxn request must always go last.
|
||||
FIND_COORDINATOR = 0
|
||||
INIT_PRODUCER_ID = 1
|
||||
ADD_PARTITIONS_OR_OFFSETS = 2
|
||||
END_TXN = 3
|
||||
|
||||
|
||||
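# Sketch of how these priorities are consumed (an assumption based on the heapq
# usage further below, not part of the original source): pending requests are pushed
# as (priority, sort_id, handler) tuples, so lower Priority values pop first and the
# monotonically increasing sort_id breaks ties in FIFO order.
def _example_priority_ordering():
    queue = []
    heapq.heappush(queue, (Priority.END_TXN, 2, 'end_txn'))
    heapq.heappush(queue, (Priority.ADD_PARTITIONS_OR_OFFSETS, 3, 'add_partitions'))
    heapq.heappush(queue, (Priority.FIND_COORDINATOR, 4, 'find_coordinator'))
    order = [heapq.heappop(queue)[2] for _ in range(3)]
    assert order == ['find_coordinator', 'add_partitions', 'end_txn']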
class TransactionManager(object):
|
||||
"""
|
||||
A class which maintains state for transactions. Also keeps the state necessary to ensure idempotent production.
|
||||
"""
|
||||
NO_INFLIGHT_REQUEST_CORRELATION_ID = -1
|
||||
# The retry_backoff_ms is overridden to the following value if the first AddPartitions receives a
|
||||
# CONCURRENT_TRANSACTIONS error.
|
||||
ADD_PARTITIONS_RETRY_BACKOFF_MS = 20
|
||||
|
||||
def __init__(self, transactional_id=None, transaction_timeout_ms=0, retry_backoff_ms=100, api_version=(0, 11), metadata=None):
|
||||
self._api_version = api_version
|
||||
self._metadata = metadata
|
||||
|
||||
self._sequence_numbers = collections.defaultdict(lambda: 0)
|
||||
|
||||
self.transactional_id = transactional_id
|
||||
self.transaction_timeout_ms = transaction_timeout_ms
|
||||
self._transaction_coordinator = None
|
||||
self._consumer_group_coordinator = None
|
||||
self._new_partitions_in_transaction = set()
|
||||
self._pending_partitions_in_transaction = set()
|
||||
self._partitions_in_transaction = set()
|
||||
self._pending_txn_offset_commits = dict()
|
||||
|
||||
self._current_state = TransactionState.UNINITIALIZED
|
||||
self._last_error = None
|
||||
self.producer_id_and_epoch = ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH)
|
||||
|
||||
self._transaction_started = False
|
||||
|
||||
self._pending_requests = [] # priority queue via heapq
|
||||
self._pending_requests_sort_id = 0
|
||||
self._in_flight_request_correlation_id = self.NO_INFLIGHT_REQUEST_CORRELATION_ID
|
||||
|
||||
# This is used by the TxnRequestHandlers to control how long to back off before a given request is retried.
|
||||
# For instance, this value is lowered by the AddPartitionsToTxnHandler when it receives a CONCURRENT_TRANSACTIONS
|
||||
# error for the first AddPartitionsRequest in a transaction.
|
||||
self.retry_backoff_ms = retry_backoff_ms
|
||||
self._lock = threading.Condition()
|
||||
|
||||
def initialize_transactions(self):
|
||||
with self._lock:
|
||||
self._ensure_transactional()
|
||||
self._transition_to(TransactionState.INITIALIZING)
|
||||
self.set_producer_id_and_epoch(ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH))
|
||||
self._sequence_numbers.clear()
|
||||
handler = InitProducerIdHandler(self, self.transaction_timeout_ms)
|
||||
self._enqueue_request(handler)
|
||||
return handler.result
|
||||
|
||||
def begin_transaction(self):
|
||||
with self._lock:
|
||||
self._ensure_transactional()
|
||||
self._maybe_fail_with_error()
|
||||
self._transition_to(TransactionState.IN_TRANSACTION)
|
||||
|
||||
def begin_commit(self):
|
||||
with self._lock:
|
||||
self._ensure_transactional()
|
||||
self._maybe_fail_with_error()
|
||||
self._transition_to(TransactionState.COMMITTING_TRANSACTION)
|
||||
return self._begin_completing_transaction(True)
|
||||
|
||||
def begin_abort(self):
|
||||
with self._lock:
|
||||
self._ensure_transactional()
|
||||
if self._current_state != TransactionState.ABORTABLE_ERROR:
|
||||
self._maybe_fail_with_error()
|
||||
self._transition_to(TransactionState.ABORTING_TRANSACTION)
|
||||
|
||||
# We're aborting the transaction, so there should be no need to add new partitions
|
||||
self._new_partitions_in_transaction.clear()
|
||||
return self._begin_completing_transaction(False)
|
||||
|
||||
def _begin_completing_transaction(self, committed):
|
||||
if self._new_partitions_in_transaction:
|
||||
self._enqueue_request(self._add_partitions_to_transaction_handler())
|
||||
handler = EndTxnHandler(self, committed)
|
||||
self._enqueue_request(handler)
|
||||
return handler.result
|
||||
|
||||
def send_offsets_to_transaction(self, offsets, consumer_group_id):
|
||||
with self._lock:
|
||||
self._ensure_transactional()
|
||||
self._maybe_fail_with_error()
|
||||
if self._current_state != TransactionState.IN_TRANSACTION:
|
||||
raise Errors.KafkaError("Cannot send offsets to transaction because the producer is not in an active transaction")
|
||||
|
||||
log.debug("Begin adding offsets %s for consumer group %s to transaction", offsets, consumer_group_id)
|
||||
handler = AddOffsetsToTxnHandler(self, consumer_group_id, offsets)
|
||||
self._enqueue_request(handler)
|
||||
return handler.result
|
||||
|
||||
def maybe_add_partition_to_transaction(self, topic_partition):
|
||||
with self._lock:
|
||||
self._fail_if_not_ready_for_send()
|
||||
|
||||
if self.is_partition_added(topic_partition) or self.is_partition_pending_add(topic_partition):
|
||||
return
|
||||
|
||||
log.debug("Begin adding new partition %s to transaction", topic_partition)
|
||||
self._new_partitions_in_transaction.add(topic_partition)
|
||||
|
||||
def _fail_if_not_ready_for_send(self):
|
||||
with self._lock:
|
||||
if self.has_error():
|
||||
raise Errors.KafkaError(
|
||||
"Cannot perform send because at least one previous transactional or"
|
||||
" idempotent request has failed with errors.", self._last_error)
|
||||
|
||||
if self.is_transactional():
|
||||
if not self.has_producer_id():
|
||||
raise Errors.IllegalStateError(
|
||||
"Cannot perform a 'send' before completing a call to init_transactions"
|
||||
" when transactions are enabled.")
|
||||
|
||||
if self._current_state != TransactionState.IN_TRANSACTION:
|
||||
raise Errors.IllegalStateError("Cannot call send in state %s" % (self._current_state.name,))
|
||||
|
||||
def is_send_to_partition_allowed(self, tp):
|
||||
with self._lock:
|
||||
if self.has_fatal_error():
|
||||
return False
|
||||
return not self.is_transactional() or tp in self._partitions_in_transaction
|
||||
|
||||
def has_producer_id(self, producer_id=None):
|
||||
if producer_id is None:
|
||||
return self.producer_id_and_epoch.is_valid
|
||||
else:
|
||||
return self.producer_id_and_epoch.producer_id == producer_id
|
||||
|
||||
def is_transactional(self):
|
||||
return self.transactional_id is not None
|
||||
|
||||
def has_partitions_to_add(self):
|
||||
with self._lock:
|
||||
return bool(self._new_partitions_in_transaction) or bool(self._pending_partitions_in_transaction)
|
||||
|
||||
def is_completing(self):
|
||||
with self._lock:
|
||||
return self._current_state in (
|
||||
TransactionState.COMMITTING_TRANSACTION,
|
||||
TransactionState.ABORTING_TRANSACTION)
|
||||
|
||||
@property
|
||||
def last_error(self):
|
||||
return self._last_error
|
||||
|
||||
def has_error(self):
|
||||
with self._lock:
|
||||
return self._current_state in (
|
||||
TransactionState.ABORTABLE_ERROR,
|
||||
TransactionState.FATAL_ERROR)
|
||||
|
||||
def is_aborting(self):
|
||||
with self._lock:
|
||||
return self._current_state == TransactionState.ABORTING_TRANSACTION
|
||||
|
||||
def transition_to_abortable_error(self, exc):
|
||||
with self._lock:
|
||||
if self._current_state == TransactionState.ABORTING_TRANSACTION:
|
||||
log.debug("Skipping transition to abortable error state since the transaction is already being "
|
||||
" aborted. Underlying exception: %s", exc)
|
||||
return
|
||||
self._transition_to(TransactionState.ABORTABLE_ERROR, error=exc)
|
||||
|
||||
def transition_to_fatal_error(self, exc):
|
||||
with self._lock:
|
||||
self._transition_to(TransactionState.FATAL_ERROR, error=exc)
|
||||
|
||||
def is_partition_added(self, partition):
|
||||
with self._lock:
|
||||
return partition in self._partitions_in_transaction
|
||||
|
||||
def is_partition_pending_add(self, partition):
|
||||
return partition in self._new_partitions_in_transaction or partition in self._pending_partitions_in_transaction
|
||||
|
||||
def has_producer_id_and_epoch(self, producer_id, producer_epoch):
|
||||
return (
|
||||
self.producer_id_and_epoch.producer_id == producer_id and
|
||||
self.producer_id_and_epoch.epoch == producer_epoch
|
||||
)
|
||||
|
||||
def set_producer_id_and_epoch(self, producer_id_and_epoch):
|
||||
if not isinstance(producer_id_and_epoch, ProducerIdAndEpoch):
|
||||
raise TypeError("ProducerAndIdEpoch type required")
|
||||
log.info("ProducerId set to %s with epoch %s",
|
||||
producer_id_and_epoch.producer_id, producer_id_and_epoch.epoch)
|
||||
self.producer_id_and_epoch = producer_id_and_epoch
|
||||
|
||||
def reset_producer_id(self):
|
||||
"""
|
||||
This method is used when the producer needs to reset its internal state because of an irrecoverable exception
|
||||
from the broker.
|
||||
|
||||
We need to reset the producer id and associated state when we have sent a batch to the broker, but we either get
|
||||
a non-retriable exception or we run out of retries, or the batch expired in the producer queue after it was already
|
||||
sent to the broker.
|
||||
|
||||
In all of these cases, we don't know whether the batch was actually committed on the broker, and hence whether the
|
||||
sequence number was actually updated. If we don't reset the producer state, we risk the chance that all future
|
||||
messages will return an OutOfOrderSequenceNumberError.
|
||||
|
||||
Note that we can't reset the producer state for the transactional producer as this would mean bumping the epoch
|
||||
for the same producer id. This might involve aborting the ongoing transaction during the initProducerIdRequest,
|
||||
and the user would not have any way of knowing this happened. So for the transactional producer,
|
||||
it's best to return the produce error to the user and let them abort the transaction and close the producer explicitly.
|
||||
"""
|
||||
with self._lock:
|
||||
if self.is_transactional():
|
||||
raise Errors.IllegalStateError(
|
||||
"Cannot reset producer state for a transactional producer."
|
||||
" You must either abort the ongoing transaction or"
|
||||
" reinitialize the transactional producer instead")
|
||||
self.set_producer_id_and_epoch(ProducerIdAndEpoch(NO_PRODUCER_ID, NO_PRODUCER_EPOCH))
|
||||
self._sequence_numbers.clear()
|
||||
|
||||
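    # Illustrative behaviour of the guard above (sketch, not part of the original
    # source): an idempotent-only manager (transactional_id=None) may reset its
    # producer state, while a transactional one raises IllegalStateError instead.
    #
    #     >>> tm = TransactionManager()                       # transactional_id=None
    #     >>> tm.set_producer_id_and_epoch(ProducerIdAndEpoch(4000, 1))
    #     >>> tm.reset_producer_id()
    #     >>> tm.producer_id_and_epoch.is_valid
    #     False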
def sequence_number(self, tp):
|
||||
with self._lock:
|
||||
return self._sequence_numbers[tp]
|
||||
|
||||
def increment_sequence_number(self, tp, increment):
|
||||
with self._lock:
|
||||
if tp not in self._sequence_numbers:
|
||||
raise Errors.IllegalStateError("Attempt to increment sequence number for a partition with no current sequence.")
|
||||
# Sequence number wraps at java max int
|
||||
base = self._sequence_numbers[tp]
|
||||
if base > (2147483647 - increment):
|
||||
self._sequence_numbers[tp] = increment - (2147483647 - base) - 1
|
||||
else:
|
||||
self._sequence_numbers[tp] += increment
|
||||
|
||||
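    # Worked example of the wrap-around above (illustrative): with a current
    # sequence of 2147483646 and increment=5, the branch computes
    # 5 - (2147483647 - 2147483646) - 1 = 3, i.e. the five records take
    # sequences 2147483646, 2147483647, 0, 1, 2 and the next sequence is 3.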
def next_request_handler(self, has_incomplete_batches):
|
||||
with self._lock:
|
||||
if self._new_partitions_in_transaction:
|
||||
self._enqueue_request(self._add_partitions_to_transaction_handler())
|
||||
|
||||
if not self._pending_requests:
|
||||
return None
|
||||
|
||||
_, _, next_request_handler = self._pending_requests[0]
|
||||
# Do not send the EndTxn until all batches have been flushed
|
||||
if isinstance(next_request_handler, EndTxnHandler) and has_incomplete_batches:
|
||||
return None
|
||||
|
||||
heapq.heappop(self._pending_requests)
|
||||
if self._maybe_terminate_request_with_error(next_request_handler):
|
||||
log.debug("Not sending transactional request %s because we are in an error state",
|
||||
next_request_handler.request)
|
||||
return None
|
||||
|
||||
if isinstance(next_request_handler, EndTxnHandler) and not self._transaction_started:
|
||||
next_request_handler.result.done()
|
||||
if self._current_state != TransactionState.FATAL_ERROR:
|
||||
log.debug("Not sending EndTxn for completed transaction since no partitions"
|
||||
" or offsets were successfully added")
|
||||
self._complete_transaction()
|
||||
try:
|
||||
_, _, next_request_handler = heapq.heappop(self._pending_requests)
|
||||
except IndexError:
|
||||
next_request_handler = None
|
||||
|
||||
if next_request_handler:
|
||||
log.debug("Request %s dequeued for sending", next_request_handler.request)
|
||||
|
||||
return next_request_handler
|
||||
|
||||
def retry(self, request):
|
||||
with self._lock:
|
||||
request.set_retry()
|
||||
self._enqueue_request(request)
|
||||
|
||||
def authentication_failed(self, exc):
|
||||
with self._lock:
|
||||
for _, _, request in self._pending_requests:
|
||||
request.fatal_error(exc)
|
||||
|
||||
def coordinator(self, coord_type):
|
||||
if coord_type == 'group':
|
||||
return self._consumer_group_coordinator
|
||||
elif coord_type == 'transaction':
|
||||
return self._transaction_coordinator
|
||||
else:
|
||||
raise Errors.IllegalStateError("Received an invalid coordinator type: %s" % (coord_type,))
|
||||
|
||||
def lookup_coordinator_for_request(self, request):
|
||||
self._lookup_coordinator(request.coordinator_type, request.coordinator_key)
|
||||
|
||||
def next_in_flight_request_correlation_id(self):
|
||||
self._in_flight_request_correlation_id += 1
|
||||
return self._in_flight_request_correlation_id
|
||||
|
||||
def clear_in_flight_transactional_request_correlation_id(self):
|
||||
self._in_flight_request_correlation_id = self.NO_INFLIGHT_REQUEST_CORRELATION_ID
|
||||
|
||||
def has_in_flight_transactional_request(self):
|
||||
return self._in_flight_request_correlation_id != self.NO_INFLIGHT_REQUEST_CORRELATION_ID
|
||||
|
||||
def has_fatal_error(self):
|
||||
return self._current_state == TransactionState.FATAL_ERROR
|
||||
|
||||
def has_abortable_error(self):
|
||||
return self._current_state == TransactionState.ABORTABLE_ERROR
|
||||
|
||||
# visible for testing
|
||||
def _test_transaction_contains_partition(self, tp):
|
||||
with self._lock:
|
||||
return tp in self._partitions_in_transaction
|
||||
|
||||
# visible for testing
|
||||
def _test_has_pending_offset_commits(self):
|
||||
return bool(self._pending_txn_offset_commits)
|
||||
|
||||
# visible for testing
|
||||
def _test_has_ongoing_transaction(self):
|
||||
with self._lock:
|
||||
# transactions are considered ongoing once started until completion or a fatal error
|
||||
return self._current_state == TransactionState.IN_TRANSACTION or self.is_completing() or self.has_abortable_error()
|
||||
|
||||
# visible for testing
|
||||
def _test_is_ready(self):
|
||||
with self._lock:
|
||||
return self.is_transactional() and self._current_state == TransactionState.READY
|
||||
|
||||
def _transition_to(self, target, error=None):
|
||||
with self._lock:
|
||||
if not self._current_state.is_transition_valid(self._current_state, target):
|
||||
raise Errors.KafkaError("TransactionalId %s: Invalid transition attempted from state %s to state %s" % (
|
||||
self.transactional_id, self._current_state.name, target.name))
|
||||
|
||||
if target in (TransactionState.FATAL_ERROR, TransactionState.ABORTABLE_ERROR):
|
||||
if error is None:
|
||||
raise Errors.IllegalArgumentError("Cannot transition to %s with an None exception" % (target.name,))
|
||||
self._last_error = error
|
||||
else:
|
||||
self._last_error = None
|
||||
|
||||
if self._last_error is not None:
|
||||
log.debug("Transition from state %s to error state %s (%s)", self._current_state.name, target.name, self._last_error)
|
||||
else:
|
||||
log.debug("Transition from state %s to %s", self._current_state, target)
|
||||
self._current_state = target
|
||||
|
||||
def _ensure_transactional(self):
|
||||
if not self.is_transactional():
|
||||
raise Errors.IllegalStateError("Transactional method invoked on a non-transactional producer.")
|
||||
|
||||
def _maybe_fail_with_error(self):
|
||||
if self.has_error():
|
||||
raise Errors.KafkaError("Cannot execute transactional method because we are in an error state: %s" % (self._last_error,))
|
||||
|
||||
def _maybe_terminate_request_with_error(self, request_handler):
|
||||
if self.has_error():
|
||||
if self.has_abortable_error() and isinstance(request_handler, FindCoordinatorHandler):
|
||||
# No harm letting the FindCoordinator request go through if we're expecting to abort
|
||||
return False
|
||||
request_handler.fail(self._last_error)
|
||||
return True
|
||||
return False
|
||||
|
||||
def _next_pending_requests_sort_id(self):
|
||||
self._pending_requests_sort_id += 1
|
||||
return self._pending_requests_sort_id
|
||||
|
||||
def _enqueue_request(self, request_handler):
|
||||
log.debug("Enqueuing transactional request %s", request_handler.request)
|
||||
heapq.heappush(
|
||||
self._pending_requests,
|
||||
(
|
||||
request_handler.priority, # keep lowest priority at head of queue
|
||||
self._next_pending_requests_sort_id(), # break ties
|
||||
request_handler
|
||||
)
|
||||
)
|
||||
|
||||
def _lookup_coordinator(self, coord_type, coord_key):
|
||||
with self._lock:
|
||||
if coord_type == 'group':
|
||||
self._consumer_group_coordinator = None
|
||||
elif coord_type == 'transaction':
|
||||
self._transaction_coordinator = None
|
||||
else:
|
||||
raise Errors.IllegalStateError("Invalid coordinator type: %s" % (coord_type,))
|
||||
self._enqueue_request(FindCoordinatorHandler(self, coord_type, coord_key))
|
||||
|
||||
def _complete_transaction(self):
|
||||
with self._lock:
|
||||
self._transition_to(TransactionState.READY)
|
||||
self._transaction_started = False
|
||||
self._new_partitions_in_transaction.clear()
|
||||
self._pending_partitions_in_transaction.clear()
|
||||
self._partitions_in_transaction.clear()
|
||||
|
||||
def _add_partitions_to_transaction_handler(self):
|
||||
with self._lock:
|
||||
self._pending_partitions_in_transaction.update(self._new_partitions_in_transaction)
|
||||
self._new_partitions_in_transaction.clear()
|
||||
return AddPartitionsToTxnHandler(self, self._pending_partitions_in_transaction)
|
||||
|
||||
|
||||
class TransactionalRequestResult(object):
|
||||
def __init__(self):
|
||||
self._latch = threading.Event()
|
||||
self._error = None
|
||||
|
||||
def done(self, error=None):
|
||||
self._error = error
|
||||
self._latch.set()
|
||||
|
||||
def wait(self, timeout_ms=None):
|
||||
timeout = timeout_ms / 1000 if timeout_ms is not None else None
|
||||
success = self._latch.wait(timeout)
|
||||
if self._error:
|
||||
raise self._error
|
||||
return success
|
||||
|
||||
@property
|
||||
def is_done(self):
|
||||
return self._latch.is_set()
|
||||
|
||||
@property
|
||||
def succeeded(self):
|
||||
return self._latch.is_set() and self._error is None
|
||||
|
||||
@property
|
||||
def failed(self):
|
||||
return self._latch.is_set() and self._error is not None
|
||||
|
||||
@property
|
||||
def exception(self):
|
||||
return self._error
|
||||
|
||||
|
||||
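# Minimal usage sketch (not part of the original source): a request handler calls
# done() on the result, and a caller blocked in wait() either returns normally or
# re-raises the recorded error; here we only inspect the completed states.
def _example_transactional_request_result():
    ok = TransactionalRequestResult()
    ok.done()
    assert ok.is_done and ok.succeeded and ok.wait(timeout_ms=0)

    failed = TransactionalRequestResult()
    failed.done(error=Errors.KafkaError('request failed'))
    assert failed.failed and isinstance(failed.exception, Errors.KafkaError)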
@six.add_metaclass(abc.ABCMeta)
|
||||
class TxnRequestHandler(object):
|
||||
def __init__(self, transaction_manager, result=None):
|
||||
self.transaction_manager = transaction_manager
|
||||
self.retry_backoff_ms = transaction_manager.retry_backoff_ms
|
||||
self.request = None
|
||||
self._result = result or TransactionalRequestResult()
|
||||
self._is_retry = False
|
||||
|
||||
@property
|
||||
def transactional_id(self):
|
||||
return self.transaction_manager.transactional_id
|
||||
|
||||
@property
|
||||
def producer_id(self):
|
||||
return self.transaction_manager.producer_id_and_epoch.producer_id
|
||||
|
||||
@property
|
||||
def producer_epoch(self):
|
||||
return self.transaction_manager.producer_id_and_epoch.epoch
|
||||
|
||||
def fatal_error(self, exc):
|
||||
self.transaction_manager.transition_to_fatal_error(exc)
|
||||
self._result.done(error=exc)
|
||||
|
||||
def abortable_error(self, exc):
|
||||
self.transaction_manager.transition_to_abortable_error(exc)
|
||||
self._result.done(error=exc)
|
||||
|
||||
def fail(self, exc):
|
||||
self._result.done(error=exc)
|
||||
|
||||
def reenqueue(self):
|
||||
with self.transaction_manager._lock:
|
||||
self._is_retry = True
|
||||
self.transaction_manager._enqueue_request(self)
|
||||
|
||||
def on_complete(self, correlation_id, response_or_exc):
|
||||
if correlation_id != self.transaction_manager._in_flight_request_correlation_id:
|
||||
self.fatal_error(RuntimeError("Detected more than one in-flight transactional request."))
|
||||
else:
|
||||
self.transaction_manager.clear_in_flight_transactional_request_correlation_id()
|
||||
if isinstance(response_or_exc, Errors.KafkaConnectionError):
|
||||
log.debug("Disconnected from node. Will retry.")
|
||||
if self.needs_coordinator():
|
||||
self.transaction_manager._lookup_coordinator(self.coordinator_type, self.coordinator_key)
|
||||
self.reenqueue()
|
||||
elif isinstance(response_or_exc, Errors.UnsupportedVersionError):
|
||||
self.fatal_error(response_or_exc)
|
||||
elif not isinstance(response_or_exc, (Exception, type(None))):
|
||||
log.debug("Received transactional response %s for request %s", response_or_exc, self.request)
|
||||
with self.transaction_manager._lock:
|
||||
self.handle_response(response_or_exc)
|
||||
else:
|
||||
self.fatal_error(Errors.KafkaError("Could not execute transactional request for unknown reasons: %s" % response_or_exc))
|
||||
|
||||
def needs_coordinator(self):
|
||||
return self.coordinator_type is not None
|
||||
|
||||
@property
|
||||
def result(self):
|
||||
return self._result
|
||||
|
||||
@property
|
||||
def coordinator_type(self):
|
||||
return 'transaction'
|
||||
|
||||
@property
|
||||
def coordinator_key(self):
|
||||
return self.transaction_manager.transactional_id
|
||||
|
||||
def set_retry(self):
|
||||
self._is_retry = True
|
||||
|
||||
@property
|
||||
def is_retry(self):
|
||||
return self._is_retry
|
||||
|
||||
@abc.abstractmethod
|
||||
def handle_response(self, response):
|
||||
pass
|
||||
|
||||
@abc.abstractproperty
|
||||
def priority(self):
|
||||
pass
|
||||
|
||||
|
||||
class InitProducerIdHandler(TxnRequestHandler):
|
||||
def __init__(self, transaction_manager, transaction_timeout_ms):
|
||||
super(InitProducerIdHandler, self).__init__(transaction_manager)
|
||||
|
||||
if transaction_manager._api_version >= (2, 0):
|
||||
version = 1
|
||||
else:
|
||||
version = 0
|
||||
self.request = InitProducerIdRequest[version](
|
||||
transactional_id=self.transactional_id,
|
||||
transaction_timeout_ms=transaction_timeout_ms)
|
||||
|
||||
@property
|
||||
def priority(self):
|
||||
return Priority.INIT_PRODUCER_ID
|
||||
|
||||
def handle_response(self, response):
|
||||
error = Errors.for_code(response.error_code)
|
||||
|
||||
if error is Errors.NoError:
|
||||
self.transaction_manager.set_producer_id_and_epoch(ProducerIdAndEpoch(response.producer_id, response.producer_epoch))
|
||||
self.transaction_manager._transition_to(TransactionState.READY)
|
||||
self._result.done()
|
||||
elif error in (Errors.NotCoordinatorError, Errors.CoordinatorNotAvailableError):
|
||||
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
|
||||
self.reenqueue()
|
||||
elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError):
|
||||
self.reenqueue()
|
||||
elif error is Errors.TransactionalIdAuthorizationFailedError:
|
||||
self.fatal_error(error())
|
||||
else:
|
||||
self.fatal_error(Errors.KafkaError("Unexpected error in InitProducerIdResponse: %s" % (error())))
|
||||
|
||||
class AddPartitionsToTxnHandler(TxnRequestHandler):
|
||||
def __init__(self, transaction_manager, topic_partitions):
|
||||
super(AddPartitionsToTxnHandler, self).__init__(transaction_manager)
|
||||
|
||||
if transaction_manager._api_version >= (2, 7):
|
||||
version = 2
|
||||
elif transaction_manager._api_version >= (2, 0):
|
||||
version = 1
|
||||
else:
|
||||
version = 0
|
||||
topic_data = collections.defaultdict(list)
|
||||
for tp in topic_partitions:
|
||||
topic_data[tp.topic].append(tp.partition)
|
||||
self.request = AddPartitionsToTxnRequest[version](
|
||||
transactional_id=self.transactional_id,
|
||||
producer_id=self.producer_id,
|
||||
producer_epoch=self.producer_epoch,
|
||||
topics=list(topic_data.items()))
|
||||
|
||||
@property
|
||||
def priority(self):
|
||||
return Priority.ADD_PARTITIONS_OR_OFFSETS
|
||||
|
||||
def handle_response(self, response):
|
||||
has_partition_errors = False
|
||||
unauthorized_topics = set()
|
||||
self.retry_backoff_ms = self.transaction_manager.retry_backoff_ms
|
||||
|
||||
results = {TopicPartition(topic, partition): Errors.for_code(error_code)
|
||||
for topic, partition_data in response.results
|
||||
for partition, error_code in partition_data}
|
||||
|
||||
for tp, error in six.iteritems(results):
|
||||
if error is Errors.NoError:
|
||||
continue
|
||||
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError):
|
||||
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
|
||||
self.reenqueue()
|
||||
return
|
||||
elif error is Errors.ConcurrentTransactionsError:
|
||||
self.maybe_override_retry_backoff_ms()
|
||||
self.reenqueue()
|
||||
return
|
||||
elif error in (Errors.CoordinatorLoadInProgressError, Errors.UnknownTopicOrPartitionError):
|
||||
self.reenqueue()
|
||||
return
|
||||
elif error is Errors.InvalidProducerEpochError:
|
||||
self.fatal_error(error())
|
||||
return
|
||||
elif error is Errors.TransactionalIdAuthorizationFailedError:
|
||||
self.fatal_error(error())
|
||||
return
|
||||
elif error in (Errors.InvalidProducerIdMappingError, Errors.InvalidTxnStateError):
|
||||
self.fatal_error(Errors.KafkaError(error()))
|
||||
return
|
||||
elif error is Errors.TopicAuthorizationFailedError:
|
||||
unauthorized_topics.add(tp.topic)
|
||||
elif error is Errors.OperationNotAttemptedError:
|
||||
log.debug("Did not attempt to add partition %s to transaction because other partitions in the"
|
||||
" batch had errors.", tp)
|
||||
has_partition_errors = True
|
||||
else:
|
||||
log.error("Could not add partition %s due to unexpected error %s", tp, error())
|
||||
has_partition_errors = True
|
||||
|
||||
partitions = set(results)
|
||||
|
||||
# Remove the partitions from the pending set regardless of the result. We use the presence
|
||||
# of partitions in the pending set to know when it is not safe to send batches. However, if
|
||||
# the partitions failed to be added and we enter an error state, we expect the batches to be
|
||||
# aborted anyway. In this case, we must be able to continue sending the batches which are in
|
||||
# retry for partitions that were successfully added.
|
||||
self.transaction_manager._pending_partitions_in_transaction -= partitions
|
||||
|
||||
if unauthorized_topics:
|
||||
self.abortable_error(Errors.TopicAuthorizationFailedError(unauthorized_topics))
|
||||
elif has_partition_errors:
|
||||
self.abortable_error(Errors.KafkaError("Could not add partitions to transaction due to errors: %s" % (results)))
|
||||
else:
|
||||
log.debug("Successfully added partitions %s to transaction", partitions)
|
||||
self.transaction_manager._partitions_in_transaction.update(partitions)
|
||||
self.transaction_manager._transaction_started = True
|
||||
self._result.done()
|
||||
|
||||
def maybe_override_retry_backoff_ms(self):
|
||||
# We only want to reduce the backoff when retrying the first AddPartition which errored out due to a
|
||||
# CONCURRENT_TRANSACTIONS error since this means that the previous transaction is still completing and
|
||||
# we don't want to wait too long before trying to start the new one.
|
||||
#
|
||||
# This is only a temporary fix, the long term solution is being tracked in
|
||||
# https://issues.apache.org/jira/browse/KAFKA-5482
|
||||
if not self.transaction_manager._partitions_in_transaction:
|
||||
self.retry_backoff_ms = min(self.transaction_manager.ADD_PARTITIONS_RETRY_BACKOFF_MS, self.retry_backoff_ms)
|
||||
|
||||
|
||||
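# Illustrative numbers for the override above (sketch, assuming the default
# retry_backoff_ms of 100): the first AddPartitions retry after a
# CONCURRENT_TRANSACTIONS error backs off for only
# min(ADD_PARTITIONS_RETRY_BACKOFF_MS, 100) = 20 ms rather than the full 100 ms.
def _example_add_partitions_retry_backoff():
    assert min(TransactionManager.ADD_PARTITIONS_RETRY_BACKOFF_MS, 100) == 20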
class FindCoordinatorHandler(TxnRequestHandler):
|
||||
def __init__(self, transaction_manager, coord_type, coord_key):
|
||||
super(FindCoordinatorHandler, self).__init__(transaction_manager)
|
||||
|
||||
self._coord_type = coord_type
|
||||
self._coord_key = coord_key
|
||||
if transaction_manager._api_version >= (2, 0):
|
||||
version = 2
|
||||
else:
|
||||
version = 1
|
||||
if coord_type == 'group':
|
||||
coord_type_int8 = 0
|
||||
elif coord_type == 'transaction':
|
||||
coord_type_int8 = 1
|
||||
else:
|
||||
raise ValueError("Unrecognized coordinator type: %s" % (coord_type,))
|
||||
self.request = FindCoordinatorRequest[version](
|
||||
coordinator_key=coord_key,
|
||||
coordinator_type=coord_type_int8,
|
||||
)
|
||||
|
||||
@property
|
||||
def priority(self):
|
||||
return Priority.FIND_COORDINATOR
|
||||
|
||||
@property
|
||||
def coordinator_type(self):
|
||||
return None
|
||||
|
||||
@property
|
||||
def coordinator_key(self):
|
||||
return None
|
||||
|
||||
def handle_response(self, response):
|
||||
error = Errors.for_code(response.error_code)
|
||||
|
||||
if error is Errors.NoError:
|
||||
coordinator_id = self.transaction_manager._metadata.add_coordinator(
|
||||
response, self._coord_type, self._coord_key)
|
||||
if self._coord_type == 'group':
|
||||
self.transaction_manager._consumer_group_coordinator = coordinator_id
|
||||
elif self._coord_type == 'transaction':
|
||||
self.transaction_manager._transaction_coordinator = coordinator_id
|
||||
self._result.done()
|
||||
elif error is Errors.CoordinatorNotAvailableError:
|
||||
self.reenqueue()
|
||||
elif error is Errors.TransactionalIdAuthorizationFailedError:
|
||||
self.fatal_error(error())
|
||||
elif error is Errors.GroupAuthorizationFailedError:
|
||||
self.abortable_error(error(self._coord_key))
|
||||
else:
|
||||
self.fatal_error(Errors.KafkaError(
|
||||
"Could not find a coordinator with type %s with key %s due to"
|
||||
" unexpected error: %s" % (self._coord_type, self._coord_key, error())))
|
||||
|
||||
|
||||
class EndTxnHandler(TxnRequestHandler):
|
||||
def __init__(self, transaction_manager, committed):
|
||||
super(EndTxnHandler, self).__init__(transaction_manager)
|
||||
|
||||
if self.transaction_manager._api_version >= (2, 7):
|
||||
version = 2
|
||||
elif self.transaction_manager._api_version >= (2, 0):
|
||||
version = 1
|
||||
else:
|
||||
version = 0
|
||||
self.request = EndTxnRequest[version](
|
||||
transactional_id=self.transactional_id,
|
||||
producer_id=self.producer_id,
|
||||
producer_epoch=self.producer_epoch,
|
||||
committed=committed)
|
||||
|
||||
@property
|
||||
def priority(self):
|
||||
return Priority.END_TXN
|
||||
|
||||
def handle_response(self, response):
|
||||
error = Errors.for_code(response.error_code)
|
||||
|
||||
if error is Errors.NoError:
|
||||
self.transaction_manager._complete_transaction()
|
||||
self._result.done()
|
||||
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError):
|
||||
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
|
||||
self.reenqueue()
|
||||
elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError):
|
||||
self.reenqueue()
|
||||
elif error is Errors.InvalidProducerEpochError:
|
||||
self.fatal_error(error())
|
||||
elif error is Errors.TransactionalIdAuthorizationFailedError:
|
||||
self.fatal_error(error())
|
||||
elif error is Errors.InvalidTxnStateError:
|
||||
self.fatal_error(error())
|
||||
else:
|
||||
self.fatal_error(Errors.KafkaError("Unhandled error in EndTxnResponse: %s" % (error())))
|
||||
|
||||
|
||||
class AddOffsetsToTxnHandler(TxnRequestHandler):
|
||||
def __init__(self, transaction_manager, consumer_group_id, offsets):
|
||||
super(AddOffsetsToTxnHandler, self).__init__(transaction_manager)
|
||||
|
||||
self.consumer_group_id = consumer_group_id
|
||||
self.offsets = offsets
|
||||
if self.transaction_manager._api_version >= (2, 7):
|
||||
version = 2
|
||||
elif self.transaction_manager._api_version >= (2, 0):
|
||||
version = 1
|
||||
else:
|
||||
version = 0
|
||||
self.request = AddOffsetsToTxnRequest[version](
|
||||
transactional_id=self.transactional_id,
|
||||
producer_id=self.producer_id,
|
||||
producer_epoch=self.producer_epoch,
|
||||
group_id=consumer_group_id)
|
||||
|
||||
@property
|
||||
def priority(self):
|
||||
return Priority.ADD_PARTITIONS_OR_OFFSETS
|
||||
|
||||
def handle_response(self, response):
|
||||
error = Errors.for_code(response.error_code)
|
||||
|
||||
if error is Errors.NoError:
|
||||
log.debug("Successfully added partition for consumer group %s to transaction", self.consumer_group_id)
|
||||
|
||||
# note the result is not completed until the TxnOffsetCommit returns
|
||||
for tp, offset in six.iteritems(self.offsets):
|
||||
self.transaction_manager._pending_txn_offset_commits[tp] = offset
|
||||
handler = TxnOffsetCommitHandler(self.transaction_manager, self.consumer_group_id,
|
||||
self.transaction_manager._pending_txn_offset_commits, self._result)
|
||||
self.transaction_manager._enqueue_request(handler)
|
||||
self.transaction_manager._transaction_started = True
|
||||
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError):
|
||||
self.transaction_manager._lookup_coordinator('transaction', self.transactional_id)
|
||||
self.reenqueue()
|
||||
elif error in (Errors.CoordinatorLoadInProgressError, Errors.ConcurrentTransactionsError):
|
||||
self.reenqueue()
|
||||
elif error is Errors.InvalidProducerEpochError:
|
||||
self.fatal_error(error())
|
||||
elif error is Errors.TransactionalIdAuthorizationFailedError:
|
||||
self.fatal_error(error())
|
||||
elif error is Errors.GroupAuthorizationFailedError:
|
||||
self.abortable_error(error(self.consumer_group_id))
|
||||
else:
|
||||
self.fatal_error(Errors.KafkaError("Unexpected error in AddOffsetsToTxnResponse: %s" % (error())))
|
||||
|
||||
|
||||
class TxnOffsetCommitHandler(TxnRequestHandler):
|
||||
def __init__(self, transaction_manager, consumer_group_id, offsets, result):
|
||||
super(TxnOffsetCommitHandler, self).__init__(transaction_manager, result=result)
|
||||
|
||||
self.consumer_group_id = consumer_group_id
|
||||
self.offsets = offsets
|
||||
self.request = self._build_request()
|
||||
|
||||
def _build_request(self):
|
||||
if self.transaction_manager._api_version >= (2, 1):
|
||||
version = 2
|
||||
elif self.transaction_manager._api_version >= (2, 0):
|
||||
version = 1
|
||||
else:
|
||||
version = 0
|
||||
|
||||
topic_data = collections.defaultdict(list)
|
||||
for tp, offset in six.iteritems(self.offsets):
|
||||
if version >= 2:
|
||||
partition_data = (tp.partition, offset.offset, offset.leader_epoch, offset.metadata)
|
||||
else:
|
||||
partition_data = (tp.partition, offset.offset, offset.metadata)
|
||||
topic_data[tp.topic].append(partition_data)
|
||||
|
||||
return TxnOffsetCommitRequest[version](
|
||||
transactional_id=self.transactional_id,
|
||||
group_id=self.consumer_group_id,
|
||||
producer_id=self.producer_id,
|
||||
producer_epoch=self.producer_epoch,
|
||||
topics=list(topic_data.items()))
|
||||
|
||||
@property
|
||||
def priority(self):
|
||||
return Priority.ADD_PARTITIONS_OR_OFFSETS
|
||||
|
||||
@property
|
||||
def coordinator_type(self):
|
||||
return 'group'
|
||||
|
||||
@property
|
||||
def coordinator_key(self):
|
||||
return self.consumer_group_id
|
||||
|
||||
def handle_response(self, response):
|
||||
lookup_coordinator = False
|
||||
retriable_failure = False
|
||||
|
||||
errors = {TopicPartition(topic, partition): Errors.for_code(error_code)
|
||||
for topic, partition_data in response.topics
|
||||
for partition, error_code in partition_data}
|
||||
|
||||
for tp, error in six.iteritems(errors):
|
||||
if error is Errors.NoError:
|
||||
log.debug("Successfully added offsets for %s from consumer group %s to transaction.",
|
||||
tp, self.consumer_group_id)
|
||||
del self.transaction_manager._pending_txn_offset_commits[tp]
|
||||
elif error in (Errors.CoordinatorNotAvailableError, Errors.NotCoordinatorError, Errors.RequestTimedOutError):
|
||||
retriable_failure = True
|
||||
lookup_coordinator = True
|
||||
elif error is Errors.UnknownTopicOrPartitionError:
|
||||
retriable_failure = True
|
||||
elif error is Errors.GroupAuthorizationFailedError:
|
||||
self.abortable_error(error(self.consumer_group_id))
|
||||
return
|
||||
elif error in (Errors.TransactionalIdAuthorizationFailedError,
|
||||
Errors.InvalidProducerEpochError,
|
||||
Errors.UnsupportedForMessageFormatError):
|
||||
self.fatal_error(error())
|
||||
return
|
||||
else:
|
||||
self.fatal_error(Errors.KafkaError("Unexpected error in TxnOffsetCommitResponse: %s" % (error())))
|
||||
return
|
||||
|
||||
if lookup_coordinator:
|
||||
self.transaction_manager._lookup_coordinator('group', self.consumer_group_id)
|
||||
|
||||
if not retriable_failure:
|
||||
# all attempted partitions were either successful, or there was a fatal failure.
|
||||
# either way, we are not retrying, so complete the request.
|
||||
self.result.done()
|
||||
|
||||
# retry the commits which failed with a retriable error.
|
||||
elif self.transaction_manager._pending_txn_offset_commits:
|
||||
self.offsets = self.transaction_manager._pending_txn_offset_commits
|
||||
self.request = self._build_request()
|
||||
self.reenqueue()
|
||||