Commit on branch main
All checks were successful (continuous-integration/drone/push: build is passing)

commit 537e7b363f (parent 91c7e04474)
2025-10-16 16:30:25 +09:00
1146 changed files with 45926 additions and 77196 deletions


@@ -1,11 +1,11 @@
from __future__ import absolute_import, division
from __future__ import absolute_import
import atexit
import copy
import logging
import socket
import threading
import warnings
import time
import weakref
from kafka.vendor import six
@@ -18,12 +18,10 @@ from kafka.partitioner.default import DefaultPartitioner
from kafka.producer.future import FutureRecordMetadata, FutureProduceResult
from kafka.producer.record_accumulator import AtomicInteger, RecordAccumulator
from kafka.producer.sender import Sender
from kafka.producer.transaction_manager import TransactionManager
from kafka.record.default_records import DefaultRecordBatchBuilder
from kafka.record.legacy_records import LegacyRecordBatchBuilder
from kafka.serializer import Serializer
from kafka.structs import TopicPartition
from kafka.util import Timer, ensure_valid_topic_name
log = logging.getLogger(__name__)
@@ -36,8 +34,8 @@ class KafkaProducer(object):
The producer is thread safe and sharing a single producer instance across
threads will generally be faster than having multiple instances.
The producer consists of a RecordAccumulator which holds records that
haven't yet been transmitted to the server, and a Sender background I/O
The producer consists of a pool of buffer space that holds records that
haven't yet been transmitted to the server as well as a background I/O
thread that is responsible for turning these records into requests and
transmitting them to the cluster.
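As an illustrative aside (not part of the diffed source), a minimal round trip through the producer described above might look like this; the broker address and topic name are placeholders:

    from kafka import KafkaProducer

    producer = KafkaProducer(bootstrap_servers='localhost:9092')  # placeholder broker
    producer.send('example-topic', b'raw bytes')  # record is buffered, not sent synchronously
    producer.flush()   # block until the background I/O thread has transmitted pending records
    producer.close()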
@@ -73,50 +71,14 @@ class KafkaProducer(object):
can lead to fewer, more efficient requests when not under maximal load at
the cost of a small amount of latency.
The buffer_memory controls the total amount of memory available to the
producer for buffering. If records are sent faster than they can be
transmitted to the server then this buffer space will be exhausted. When
the buffer space is exhausted additional send calls will block.
The key_serializer and value_serializer instruct how to turn the key and
value objects the user provides into bytes.
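A hedged sketch of the serializer hooks just described; the JSON encoding, broker address, and topic name are assumptions for illustration:

    import json
    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',                         # placeholder
        key_serializer=str.encode,                                  # str key -> bytes
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),   # dict value -> JSON bytes
    )
    producer.send('example-topic', key='user-1', value={'clicks': 3})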
From Kafka 0.11, the KafkaProducer supports two additional modes:
the idempotent producer and the transactional producer.
The idempotent producer strengthens Kafka's delivery semantics from
at least once to exactly once delivery. In particular, producer retries
will no longer introduce duplicates. The transactional producer allows an
application to send messages to multiple partitions (and topics!)
atomically.
To enable idempotence, the `enable_idempotence` configuration must be set
to True. If set, the `retries` config will default to `float('inf')` and
the `acks` config will default to 'all'. There are no API changes for the
idempotent producer, so existing applications will not need to be modified
to take advantage of this feature.
To take advantage of the idempotent producer, it is imperative to avoid
application level re-sends since these cannot be de-duplicated. As such, if
an application enables idempotence, it is recommended to leave the
`retries` config unset, as it will be defaulted to `float('inf')`.
Additionally, if a :meth:`~kafka.KafkaProducer.send` returns an error even
with infinite retries (for instance if the message expires in the buffer
before being sent), then it is recommended to shut down the producer and
check the contents of the last produced message to ensure that it is not
duplicated. Finally, the producer can only guarantee idempotence for
messages sent within a single session.
To use the transactional producer and the attendant APIs, you must set the
`transactional_id` configuration property. If the `transactional_id` is
set, idempotence is automatically enabled along with the producer configs
which idempotence depends on. Further, topics which are included in
transactions should be configured for durability. In particular, the
`replication.factor` should be at least `3`, and the `min.insync.replicas`
for these topics should be set to 2. Finally, in order for transactional
guarantees to be realized from end-to-end, the consumers must be
configured to read only committed messages as well.
The purpose of the `transactional_id` is to enable transaction recovery
across multiple sessions of a single producer instance. It would typically
be derived from the shard identifier in a partitioned, stateful,
application. As such, it should be unique to each producer instance running
within a partitioned application.
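Purely for illustration (these options exist only on the side of this diff that retains transaction support; the id and broker address are made up), enabling the two modes described above is a matter of configuration:

    from kafka import KafkaProducer

    # Idempotent producer: acks defaults to 'all' and retries to float('inf').
    idempotent = KafkaProducer(bootstrap_servers='localhost:9092',
                               enable_idempotence=True)

    # Transactional producer: setting transactional_id implies enable_idempotence=True.
    transactional = KafkaProducer(bootstrap_servers='localhost:9092',
                                  transactional_id='orders-shard-0')  # hypothetical id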
Keyword Arguments:
bootstrap_servers: 'host[:port]' string (or list of 'host[:port]'
strings) that the producer should contact to bootstrap initial
@@ -134,28 +96,6 @@ class KafkaProducer(object):
value_serializer (callable): used to convert user-supplied message
values to bytes. If not None, called as f(value), should return
bytes. Default: None.
enable_idempotence (bool): When set to True, the producer will ensure
that exactly one copy of each message is written in the stream.
If False, producer retries due to broker failures, etc., may write
duplicates of the retried message in the stream. Default: False.
Note that enabling idempotence requires
`max_in_flight_requests_per_connection` to be set to 1 and `retries`
cannot be zero. Additionally, `acks` must be set to 'all'. If these
values are left at their defaults, the producer will override the
defaults to be suitable. If the values are set to something
incompatible with the idempotent producer, a KafkaConfigurationError
will be raised.
delivery_timeout_ms (float): An upper bound on the time to report success
or failure after producer.send() returns. This limits the total time
that a record will be delayed prior to sending, the time to await
acknowledgement from the broker (if expected), and the time allowed
for retriable send failures. The producer may report failure to send
a record earlier than this config if either an unrecoverable error is
encountered, the retries have been exhausted, or the record is added
to a batch which reached an earlier delivery expiration deadline.
The value of this config should be greater than or equal to the
sum of (request_timeout_ms + linger_ms). Default: 120000.
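A small sketch of the constraint stated above; the numbers are arbitrary examples and delivery_timeout_ms is only recognized on the side of this diff that documents it:

    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',   # placeholder
        request_timeout_ms=30000,
        linger_ms=5000,
        delivery_timeout_ms=120000,           # >= request_timeout_ms + linger_ms (35000 here)
    )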
acks (0, 1, 'all'): The number of acknowledgments the producer requires
the leader to have received before considering a request complete.
This controls the durability of records that are sent. The
@@ -183,7 +123,7 @@ class KafkaProducer(object):
Compression is of full batches of data, so the efficacy of batching
will also impact the compression ratio (more batching means better
compression). Default: None.
retries (numeric): Setting a value greater than zero will cause the client
retries (int): Setting a value greater than zero will cause the client
to resend any record whose send fails with a potentially transient
error. Note that this retry is no different than if the client
resent the record upon receiving the error. Allowing retries
@@ -191,12 +131,8 @@ class KafkaProducer(object):
potentially change the ordering of records because if two batches
are sent to a single partition, and the first fails and is retried
but the second succeeds, then the records in the second batch may
appear first. Note additionally that produce requests will be
failed before the number of retries has been exhausted if the timeout
configured by delivery_timeout_ms expires first before successful
acknowledgement. Users should generally prefer to leave this config
unset and instead use delivery_timeout_ms to control retry behavior.
Default: float('inf') (infinite)
appear first.
Default: 0.
batch_size (int): Requests sent to brokers will contain multiple
batches, one for each partition with data available to be sent.
A small batch size will make batching less common and may reduce
@@ -229,6 +165,12 @@ class KafkaProducer(object):
messages with the same key are assigned to the same partition.
When a key is None, the message is delivered to a random partition
(filtered to partitions with available leaders only, if possible).
buffer_memory (int): The total bytes of memory the producer should use
to buffer records waiting to be sent to the server. If records are
sent faster than they can be delivered to the server the producer
will block up to max_block_ms, raising an exception on timeout.
In the current implementation, this setting is an approximation.
Default: 33554432 (32MB)
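An illustrative throughput-oriented configuration combining the batching and buffering knobs above; the values are arbitrary, and buffer_memory is listed as deprecated elsewhere in this diff:

    from kafka import KafkaProducer

    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',   # placeholder
        batch_size=32 * 1024,                 # bigger batches improve compression
        linger_ms=20,                         # wait up to 20 ms to fill a batch
        compression_type='gzip',              # whole batches are compressed
        buffer_memory=64 * 1024 * 1024,
    )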
connections_max_idle_ms: Close idle connections after the number of
milliseconds specified by this config. The broker closes idle
connections after connections.max.idle.ms, so this avoids hitting
@@ -246,9 +188,6 @@ class KafkaProducer(object):
This setting will limit the number of record batches the producer
will send in a single request to avoid sending huge requests.
Default: 1048576.
allow_auto_create_topics (bool): Enable/disable auto topic creation
on metadata request. Only available with api_version >= (0, 11).
Default: True
metadata_max_age_ms (int): The period of time in milliseconds after
which we force a refresh of metadata even if we haven't seen any
partition leadership changes to proactively discover any new
@@ -277,7 +216,7 @@ class KafkaProducer(object):
reconnection attempts will continue periodically with this fixed
rate. To avoid connection storms, a randomization factor of 0.2
will be applied to the backoff resulting in a random range between
20% below and 20% above the computed value. Default: 30000.
20% below and 20% above the computed value. Default: 1000.
max_in_flight_requests_per_connection (int): Requests are pipelined
to kafka brokers up to this number of maximum requests per
broker connection. Note that if this setting is set to be greater
@@ -294,7 +233,7 @@ class KafkaProducer(object):
should verify that the certificate matches the brokers hostname.
default: true.
ssl_cafile (str): optional filename of ca file to use in certificate
verification. default: none.
verification. default: none.
ssl_certfile (str): optional filename of file in pem format containing
the client certificate, as well as any ca certificates needed to
establish the certificate's authenticity. default: none.
@@ -313,28 +252,14 @@ class KafkaProducer(object):
or other configuration forbids use of all the specified ciphers),
an ssl.SSLError will be raised. See ssl.SSLContext.set_ciphers
api_version (tuple): Specify which Kafka API version to use. If set to
None, the client will attempt to determine the broker version via
ApiVersionsRequest API or, for brokers earlier than 0.10, probing
various known APIs. Dynamic version checking is performed eagerly
during __init__ and can raise NoBrokersAvailableError if no connection
was made before timeout (see api_version_auto_timeout_ms below).
Different versions enable different functionality.
Examples:
(3, 9) most recent broker release, enable all supported features
(0, 11) enables message format v2 (internal)
(0, 10, 0) enables sasl authentication and message format v1
(0, 8, 0) enables basic functionality only
Default: None
None, the client will attempt to infer the broker version by probing
various APIs. Example: (0, 10, 2). Default: None
api_version_auto_timeout_ms (int): number of milliseconds to throw a
timeout exception from the constructor when checking the broker
api version. Only applies if api_version set to None.
Default: 2000
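As a sketch of the api_version behaviour described above (the version tuple and broker address are examples only):

    from kafka import KafkaProducer

    # Pinning api_version skips the broker version probing performed during __init__.
    producer = KafkaProducer(
        bootstrap_servers='localhost:9092',
        api_version=(0, 10, 2),
        api_version_auto_timeout_ms=5000,     # only consulted when api_version is None
    )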
metric_reporters (list): A list of classes to use as metrics reporters.
Implementing the AbstractMetricsReporter interface allows plugging
in classes that will be notified of new metric creation. Default: []
metrics_enabled (bool): Whether to track metrics on this instance. Default True.
metrics_num_samples (int): The number of samples maintained to compute
metrics. Default: 2
metrics_sample_window_ms (int): The maximum age in milliseconds of
@@ -349,42 +274,33 @@ class KafkaProducer(object):
Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
sasl_plain_password (str): password for sasl PLAIN and SCRAM authentication.
Required if sasl_mechanism is PLAIN or one of the SCRAM mechanisms.
sasl_kerberos_name (str or gssapi.Name): Constructed gssapi.Name for use with
sasl mechanism handshake. If provided, sasl_kerberos_service_name and
sasl_kerberos_domain_name are ignored. Default: None.
sasl_kerberos_service_name (str): Service name to include in GSSAPI
sasl mechanism handshake. Default: 'kafka'
sasl_kerberos_domain_name (str): kerberos domain name to use in GSSAPI
sasl mechanism handshake. Default: one of bootstrap servers
sasl_oauth_token_provider (kafka.sasl.oauth.AbstractTokenProvider): OAuthBearer
token provider instance. Default: None
socks5_proxy (str): Socks5 proxy URL. Default: None
kafka_client (callable): Custom class / callable for creating KafkaClient instances
sasl_oauth_token_provider (AbstractTokenProvider): OAuthBearer token provider
instance. (See kafka.oauth.abstract). Default: None
Note:
Configuration parameters are described in more detail at
https://kafka.apache.org/0100/documentation/#producerconfigs
https://kafka.apache.org/0100/configuration.html#producerconfigs
"""
DEFAULT_CONFIG = {
'bootstrap_servers': 'localhost',
'client_id': None,
'key_serializer': None,
'value_serializer': None,
'enable_idempotence': False,
'transactional_id': None,
'transaction_timeout_ms': 60000,
'delivery_timeout_ms': 120000,
'acks': 1,
'bootstrap_topics_filter': set(),
'compression_type': None,
'retries': float('inf'),
'retries': 0,
'batch_size': 16384,
'linger_ms': 0,
'partitioner': DefaultPartitioner(),
'buffer_memory': 33554432,
'connections_max_idle_ms': 9 * 60 * 1000,
'max_block_ms': 60000,
'max_request_size': 1048576,
'allow_auto_create_topics': True,
'metadata_max_age_ms': 300000,
'retry_backoff_ms': 100,
'request_timeout_ms': 30000,
@@ -394,7 +310,7 @@ class KafkaProducer(object):
'sock_chunk_bytes': 4096, # undocumented experimental option
'sock_chunk_buffer_count': 1000, # undocumented experimental option
'reconnect_backoff_ms': 50,
'reconnect_backoff_max_ms': 30000,
'reconnect_backoff_max_ms': 1000,
'max_in_flight_requests_per_connection': 5,
'security_protocol': 'PLAINTEXT',
'ssl_context': None,
@@ -408,23 +324,17 @@ class KafkaProducer(object):
'api_version': None,
'api_version_auto_timeout_ms': 2000,
'metric_reporters': [],
'metrics_enabled': True,
'metrics_num_samples': 2,
'metrics_sample_window_ms': 30000,
'selector': selectors.DefaultSelector,
'sasl_mechanism': None,
'sasl_plain_username': None,
'sasl_plain_password': None,
'sasl_kerberos_name': None,
'sasl_kerberos_service_name': 'kafka',
'sasl_kerberos_domain_name': None,
'sasl_oauth_token_provider': None,
'socks5_proxy': None,
'kafka_client': KafkaClient,
'sasl_oauth_token_provider': None
}
DEPRECATED_CONFIGS = ('buffer_memory',)
_COMPRESSORS = {
'gzip': (has_gzip, LegacyRecordBatchBuilder.CODEC_GZIP),
'snappy': (has_snappy, LegacyRecordBatchBuilder.CODEC_SNAPPY),
@@ -434,17 +344,12 @@ class KafkaProducer(object):
}
def __init__(self, **configs):
log.debug("Starting the Kafka producer") # trace
self.config = copy.copy(self.DEFAULT_CONFIG)
user_provided_configs = set(configs.keys())
for key in self.config:
if key in configs:
self.config[key] = configs.pop(key)
for key in self.DEPRECATED_CONFIGS:
if key in configs:
configs.pop(key)
warnings.warn('Deprecated Producer config: %s' % (key,), DeprecationWarning)
# Only check for extra config keys in top-level class
assert not configs, 'Unrecognized configs: %s' % (configs,)
@@ -462,35 +367,30 @@ class KafkaProducer(object):
self.config['api_version'] = None
else:
self.config['api_version'] = tuple(map(int, deprecated.split('.')))
log.warning('%s: use api_version=%s [tuple] -- "%s" as str is deprecated',
str(self), str(self.config['api_version']), deprecated)
log.debug("%s: Starting Kafka producer", str(self))
log.warning('use api_version=%s [tuple] -- "%s" as str is deprecated',
str(self.config['api_version']), deprecated)
# Configure metrics
if self.config['metrics_enabled']:
metrics_tags = {'client-id': self.config['client_id']}
metric_config = MetricConfig(samples=self.config['metrics_num_samples'],
time_window_ms=self.config['metrics_sample_window_ms'],
tags=metrics_tags)
reporters = [reporter() for reporter in self.config['metric_reporters']]
self._metrics = Metrics(metric_config, reporters)
else:
self._metrics = None
metrics_tags = {'client-id': self.config['client_id']}
metric_config = MetricConfig(samples=self.config['metrics_num_samples'],
time_window_ms=self.config['metrics_sample_window_ms'],
tags=metrics_tags)
reporters = [reporter() for reporter in self.config['metric_reporters']]
self._metrics = Metrics(metric_config, reporters)
client = self.config['kafka_client'](
metrics=self._metrics, metric_group_prefix='producer',
wakeup_timeout_ms=self.config['max_block_ms'],
**self.config)
client = KafkaClient(metrics=self._metrics, metric_group_prefix='producer',
wakeup_timeout_ms=self.config['max_block_ms'],
**self.config)
# Get auto-discovered / normalized version from client
self.config['api_version'] = client.config['api_version']
# Get auto-discovered version from client if necessary
if self.config['api_version'] is None:
self.config['api_version'] = client.config['api_version']
if self.config['compression_type'] == 'lz4':
assert self.config['api_version'] >= (0, 8, 2), 'LZ4 Requires >= Kafka 0.8.2 Brokers'
if self.config['compression_type'] == 'zstd':
assert self.config['api_version'] >= (2, 1), 'Zstd Requires >= Kafka 2.1 Brokers'
assert self.config['api_version'] >= (2, 1, 0), 'Zstd Requires >= Kafka 2.1.0 Brokers'
# Check compression_type for library support
ct = self.config['compression_type']
@@ -501,58 +401,12 @@ class KafkaProducer(object):
assert checker(), "Libraries for {} compression codec not found".format(ct)
self.config['compression_attrs'] = compression_attrs
message_version = self._max_usable_produce_magic()
self._accumulator = RecordAccumulator(message_version=message_version, metrics=self._metrics, **self.config)
self._metadata = client.cluster
self._transaction_manager = None
self._init_transactions_result = None
if 'enable_idempotence' in user_provided_configs and not self.config['enable_idempotence'] and self.config['transactional_id']:
raise Errors.KafkaConfigurationError("Cannot set transactional_id without enable_idempotence.")
if self.config['transactional_id']:
self.config['enable_idempotence'] = True
if self.config['enable_idempotence']:
assert self.config['api_version'] >= (0, 11), "Transactional/Idempotent producer requires >= Kafka 0.11 Brokers"
self._transaction_manager = TransactionManager(
transactional_id=self.config['transactional_id'],
transaction_timeout_ms=self.config['transaction_timeout_ms'],
retry_backoff_ms=self.config['retry_backoff_ms'],
api_version=self.config['api_version'],
metadata=self._metadata,
)
if self._transaction_manager.is_transactional():
log.info("%s: Instantiated a transactional producer.", str(self))
else:
log.info("%s: Instantiated an idempotent producer.", str(self))
if self.config['retries'] == 0:
raise Errors.KafkaConfigurationError("Must set 'retries' to non-zero when using the idempotent producer.")
if 'max_in_flight_requests_per_connection' not in user_provided_configs:
log.info("%s: Overriding the default 'max_in_flight_requests_per_connection' to 1 since idempontence is enabled.", str(self))
self.config['max_in_flight_requests_per_connection'] = 1
elif self.config['max_in_flight_requests_per_connection'] != 1:
raise Errors.KafkaConfigurationError("Must set 'max_in_flight_requests_per_connection' to 1 in order"
" to use the idempotent producer."
" Otherwise we cannot guarantee idempotence.")
if 'acks' not in user_provided_configs:
log.info("%s: Overriding the default 'acks' config to 'all' since idempotence is enabled", str(self))
self.config['acks'] = -1
elif self.config['acks'] != -1:
raise Errors.KafkaConfigurationError("Must set 'acks' config to 'all' in order to use the idempotent"
" producer. Otherwise we cannot guarantee idempotence")
message_version = self.max_usable_produce_magic(self.config['api_version'])
self._accumulator = RecordAccumulator(
transaction_manager=self._transaction_manager,
message_version=message_version,
**self.config)
guarantee_message_order = bool(self.config['max_in_flight_requests_per_connection'] == 1)
self._sender = Sender(client, self._metadata,
self._accumulator,
metrics=self._metrics,
transaction_manager=self._transaction_manager,
self._accumulator, self._metrics,
guarantee_message_order=guarantee_message_order,
**self.config)
self._sender.daemon = True
@@ -561,7 +415,7 @@ class KafkaProducer(object):
self._cleanup = self._cleanup_factory()
atexit.register(self._cleanup)
log.debug("%s: Kafka producer started", str(self))
log.debug("Kafka producer started")
def bootstrap_connected(self):
"""Return True if the bootstrap is connected."""
@@ -572,7 +426,7 @@ class KafkaProducer(object):
_self = weakref.proxy(self)
def wrapper():
try:
_self.close(timeout=0, null_logger=True)
_self.close(timeout=0)
except (ReferenceError, AttributeError):
pass
return wrapper
@@ -595,28 +449,28 @@ class KafkaProducer(object):
self._cleanup = None
def __del__(self):
self.close(timeout=1, null_logger=True)
# Disable logger during destruction to avoid touching dangling references
class NullLogger(object):
def __getattr__(self, name):
return lambda *args: None
def close(self, timeout=None, null_logger=False):
global log
log = NullLogger()
self.close()
def close(self, timeout=None):
"""Close this producer.
Arguments:
timeout (float, optional): timeout in seconds to wait for completion.
"""
if null_logger:
# Disable logger during destruction to avoid touching dangling references
class NullLogger(object):
def __getattr__(self, name):
return lambda *args: None
global log
log = NullLogger()
# drop our atexit handler now to avoid leaks
self._unregister_cleanup()
if not hasattr(self, '_closed') or self._closed:
log.info('%s: Kafka producer closed', str(self))
log.info('Kafka producer closed')
return
if timeout is None:
# threading.TIMEOUT_MAX is available in Python3.3+
@@ -626,16 +480,15 @@ class KafkaProducer(object):
else:
assert timeout >= 0
log.info("%s: Closing the Kafka producer with %s secs timeout.", str(self), timeout)
self.flush(timeout)
log.info("Closing the Kafka producer with %s secs timeout.", timeout)
invoked_from_callback = bool(threading.current_thread() is self._sender)
if timeout > 0:
if invoked_from_callback:
log.warning("%s: Overriding close timeout %s secs to 0 in order to"
log.warning("Overriding close timeout %s secs to 0 in order to"
" prevent useless blocking due to self-join. This"
" means you have incorrectly invoked close with a"
" non-zero timeout from the producer call-back.",
str(self), timeout)
timeout)
else:
# Try to close gracefully.
if self._sender is not None:
@@ -643,13 +496,12 @@ class KafkaProducer(object):
self._sender.join(timeout)
if self._sender is not None and self._sender.is_alive():
log.info("%s: Proceeding to force close the producer since pending"
log.info("Proceeding to force close the producer since pending"
" requests could not be completed within timeout %s.",
str(self), timeout)
timeout)
self._sender.force_close()
if self._metrics:
self._metrics.close()
self._metrics.close()
try:
self.config['key_serializer'].close()
except AttributeError:
@@ -659,23 +511,23 @@ class KafkaProducer(object):
except AttributeError:
pass
self._closed = True
log.debug("%s: The Kafka producer has closed.", str(self))
log.debug("The Kafka producer has closed.")
def partitions_for(self, topic):
"""Returns set of all known partitions for the topic."""
return self._wait_on_metadata(topic, self.config['max_block_ms'])
max_wait = self.config['max_block_ms'] / 1000.0
return self._wait_on_metadata(topic, max_wait)
@classmethod
def max_usable_produce_magic(cls, api_version):
if api_version >= (0, 11):
def _max_usable_produce_magic(self):
if self.config['api_version'] >= (0, 11):
return 2
elif api_version >= (0, 10, 0):
elif self.config['api_version'] >= (0, 10):
return 1
else:
return 0
def _estimate_size_in_bytes(self, key, value, headers=[]):
magic = self.max_usable_produce_magic(self.config['api_version'])
magic = self._max_usable_produce_magic()
if magic == 2:
return DefaultRecordBatchBuilder.estimate_size_in_bytes(
key, value, headers)
@@ -683,114 +535,6 @@ class KafkaProducer(object):
return LegacyRecordBatchBuilder.estimate_size_in_bytes(
magic, self.config['compression_type'], key, value)
def init_transactions(self):
"""
Needs to be called before any other methods when the transactional.id is set in the configuration.
This method does the following:
1. Ensures any transactions initiated by previous instances of the producer with the same
transactional_id are completed. If the previous instance had failed with a transaction in
progress, it will be aborted. If the last transaction had begun completion,
but not yet finished, this method awaits its completion.
2. Gets the internal producer id and epoch, used in all future transactional
messages issued by the producer.
Note that this method will raise KafkaTimeoutError if the transactional state cannot
be initialized before expiration of `max_block_ms`.
Retrying after a KafkaTimeoutError will continue to wait for the prior request to succeed or fail.
Retrying after any other exception will start a new initialization attempt.
Retrying after a successful initialization will do nothing.
Raises:
IllegalStateError: if no transactional_id has been configured
AuthorizationError: fatal error indicating that the configured
transactional_id is not authorized.
KafkaError: if the producer has encountered a previous fatal error or for any other unexpected error
KafkaTimeoutError: if the time taken to initialize the transaction has surpassed `max_block_ms`.
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot call init_transactions without setting a transactional_id.")
if self._init_transactions_result is None:
self._init_transactions_result = self._transaction_manager.initialize_transactions()
self._sender.wakeup()
try:
if not self._init_transactions_result.wait(timeout_ms=self.config['max_block_ms']):
raise Errors.KafkaTimeoutError("Timeout expired while initializing transactional state in %s ms." % (self.config['max_block_ms'],))
finally:
if self._init_transactions_result.failed:
self._init_transactions_result = None
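A hedged usage sketch for init_transactions() as documented above; the transactional_id, broker address, and retry-once policy are illustrative assumptions:

    from kafka import KafkaProducer
    from kafka.errors import KafkaTimeoutError

    producer = KafkaProducer(bootstrap_servers='localhost:9092',
                             transactional_id='example-txn-id')
    try:
        producer.init_transactions()
    except KafkaTimeoutError:
        # Retrying after a timeout keeps waiting on the same prior request.
        producer.init_transactions()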
def begin_transaction(self):
""" Should be called before the start of each new transaction.
Note that prior to the first invocation of this method,
you must invoke `init_transactions()` exactly one time.
Raises:
ProducerFencedError: if another producer with the same
transactional_id is active.
"""
# Set the transactional bit in the producer.
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions")
self._transaction_manager.begin_transaction()
def send_offsets_to_transaction(self, offsets, consumer_group_id):
"""
Sends a list of consumed offsets to the consumer group coordinator, and also marks
those offsets as part of the current transaction. These offsets will be considered
consumed only if the transaction is committed successfully.
This method should be used when you need to batch consumed and produced messages
together, typically in a consume-transform-produce pattern.
Arguments:
offsets ({TopicPartition: OffsetAndMetadata}): map of topic-partition -> offsets to commit
as part of current transaction.
consumer_group_id (str): Name of consumer group for offsets commit.
Raises:
IllegalStateError: if no transactional_id, or transaction has not been started.
ProducerFencedError: fatal error indicating another producer with the same transactional_id is active.
UnsupportedVersionError: fatal error indicating the broker does not support transactions (i.e. if < 0.11).
UnsupportedForMessageFormatError: fatal error indicating the message format used for the offsets
topic on the broker does not support transactions.
AuthorizationError: fatal error indicating that the configured transactional_id is not authorized.
KafkaError: if the producer has encountered a previous fatal or abortable error, or for any
other unexpected error
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot use transactional methods without enabling transactions")
result = self._transaction_manager.send_offsets_to_transaction(offsets, consumer_group_id)
self._sender.wakeup()
result.wait()
def commit_transaction(self):
""" Commits the ongoing transaction.
Raises: ProducerFencedError if another producer with the same
transactional_id is active.
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot commit transaction since transactions are not enabled")
result = self._transaction_manager.begin_commit()
self._sender.wakeup()
result.wait()
def abort_transaction(self):
""" Aborts the ongoing transaction.
Raises: ProducerFencedError if another producer with the same
transactional_id is active.
"""
if not self._transaction_manager:
raise Errors.IllegalStateError("Cannot abort transaction since transactions are not enabled.")
result = self._transaction_manager.begin_abort()
self._sender.wakeup()
result.wait()
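The consume-transform-produce pattern referenced by send_offsets_to_transaction(), sketched with hypothetical helpers (consumed_batch, transform, offsets) and a producer constructed as in the previous sketch:

    producer.init_transactions()
    producer.begin_transaction()
    try:
        for msg in consumed_batch:            # records polled from a KafkaConsumer (not shown)
            producer.send('output-topic', transform(msg.value))
        # offsets: {TopicPartition: OffsetAndMetadata} built from the consumed batch
        producer.send_offsets_to_transaction(offsets, 'example-group')
        producer.commit_transaction()
    except Exception:
        producer.abort_transaction()
        raise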
def send(self, topic, value=None, key=None, headers=None, partition=None, timestamp_ms=None):
"""Publish a message to a topic.
@@ -823,58 +567,44 @@ class KafkaProducer(object):
Raises:
KafkaTimeoutError: if unable to fetch topic metadata, or unable
to obtain memory buffer prior to configured max_block_ms
TypeError: if topic is not a string
ValueError: if topic is invalid: must contain only chars (a-zA-Z0-9._-) and be fewer than 250 chars long
AssertionError: if KafkaProducer is closed, or key and value are both None
"""
assert not self._closed, 'KafkaProducer already closed!'
assert value is not None or self.config['api_version'] >= (0, 8, 1), (
'Null messages require kafka >= 0.8.1')
assert not (value is None and key is None), 'Need at least one: key or value'
ensure_valid_topic_name(topic)
key_bytes = value_bytes = None
timer = Timer(self.config['max_block_ms'], "Failed to assign partition for message in max_block_ms.")
try:
assigned_partition = None
while assigned_partition is None and not timer.expired:
self._wait_on_metadata(topic, timer.timeout_ms)
self._wait_on_metadata(topic, self.config['max_block_ms'] / 1000.0)
key_bytes = self._serialize(
self.config['key_serializer'],
topic, key)
value_bytes = self._serialize(
self.config['value_serializer'],
topic, value)
assert type(key_bytes) in (bytes, bytearray, memoryview, type(None))
assert type(value_bytes) in (bytes, bytearray, memoryview, type(None))
key_bytes = self._serialize(
self.config['key_serializer'],
topic, key)
value_bytes = self._serialize(
self.config['value_serializer'],
topic, value)
assert type(key_bytes) in (bytes, bytearray, memoryview, type(None))
assert type(value_bytes) in (bytes, bytearray, memoryview, type(None))
assigned_partition = self._partition(topic, partition, key, value,
key_bytes, value_bytes)
if assigned_partition is None:
raise Errors.KafkaTimeoutError("Failed to assign partition for message after %s secs." % timer.elapsed_ms / 1000)
else:
partition = assigned_partition
partition = self._partition(topic, partition, key, value,
key_bytes, value_bytes)
if headers is None:
headers = []
assert isinstance(headers, list)
assert all(isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str) and isinstance(item[1], bytes) for item in headers)
assert type(headers) == list
assert all(type(item) == tuple and len(item) == 2 and type(item[0]) == str and type(item[1]) == bytes for item in headers)
message_size = self._estimate_size_in_bytes(key_bytes, value_bytes, headers)
self._ensure_valid_record_size(message_size)
tp = TopicPartition(topic, partition)
log.debug("%s: Sending (key=%r value=%r headers=%r) to %s", str(self), key, value, headers, tp)
if self._transaction_manager and self._transaction_manager.is_transactional():
self._transaction_manager.maybe_add_partition_to_transaction(tp)
log.debug("Sending (key=%r value=%r headers=%r) to %s", key, value, headers, tp)
result = self._accumulator.append(tp, timestamp_ms,
key_bytes, value_bytes, headers)
key_bytes, value_bytes, headers,
self.config['max_block_ms'],
estimated_size=message_size)
future, batch_is_full, new_batch_created = result
if batch_is_full or new_batch_created:
log.debug("%s: Waking up the sender since %s is either full or"
" getting a new batch", str(self), tp)
log.debug("Waking up the sender since %s is either full or"
" getting a new batch", tp)
self._sender.wakeup()
return future
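For illustration, the FutureRecordMetadata returned by send() can be resolved synchronously or via callbacks; the topic, key, value, and header contents are placeholders and `producer` is assumed to exist:

    from kafka.errors import KafkaError

    future = producer.send('example-topic', key=b'user-1', value=b'payload',
                           headers=[('trace-id', b'abc123')])
    try:
        md = future.get(timeout=10)           # block for the broker acknowledgement
        print(md.topic, md.partition, md.offset)
    except KafkaError:
        pass                                  # or use future.add_callback / future.add_errback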
@@ -882,7 +612,7 @@ class KafkaProducer(object):
# for API exceptions return them in the future,
# for other exceptions raise directly
except Errors.BrokerResponseError as e:
log.error("%s: Exception occurred during message send: %s", str(self), e)
log.debug("Exception occurred during message send: %s", e)
return FutureRecordMetadata(
FutureProduceResult(TopicPartition(topic, partition)),
-1, None, None,
@@ -913,7 +643,7 @@ class KafkaProducer(object):
KafkaTimeoutError: failure to flush buffered records within the
provided timeout
"""
log.debug("%s: Flushing accumulated records in producer.", str(self))
log.debug("Flushing accumulated records in producer.") # trace
self._accumulator.begin_flush()
self._sender.wakeup()
self._accumulator.await_flush_completion(timeout=timeout)
@@ -925,8 +655,13 @@ class KafkaProducer(object):
"The message is %d bytes when serialized which is larger than"
" the maximum request size you have configured with the"
" max_request_size configuration" % (size,))
if size > self.config['buffer_memory']:
raise Errors.MessageSizeTooLargeError(
"The message is %d bytes when serialized which is larger than"
" the total memory buffer you have configured with the"
" buffer_memory configuration." % (size,))
def _wait_on_metadata(self, topic, max_wait_ms):
def _wait_on_metadata(self, topic, max_wait):
"""
Wait for cluster metadata including partitions for the given topic to
be available.
@@ -944,31 +679,32 @@ class KafkaProducer(object):
"""
# add topic to metadata topic list if it is not there already.
self._sender.add_topic(topic)
timer = Timer(max_wait_ms, "Failed to update metadata after %.1f secs." % (max_wait_ms / 1000,))
begin = time.time()
elapsed = 0.0
metadata_event = None
while True:
partitions = self._metadata.partitions_for_topic(topic)
if partitions is not None:
return partitions
timer.maybe_raise()
if not metadata_event:
metadata_event = threading.Event()
log.debug("%s: Requesting metadata update for topic %s", str(self), topic)
log.debug("Requesting metadata update for topic %s", topic)
metadata_event.clear()
future = self._metadata.request_update()
future.add_both(lambda e, *args: e.set(), metadata_event)
self._sender.wakeup()
metadata_event.wait(timer.timeout_ms / 1000)
if not future.is_done:
metadata_event.wait(max_wait - elapsed)
elapsed = time.time() - begin
if not metadata_event.is_set():
raise Errors.KafkaTimeoutError(
"Failed to update metadata after %.1f secs." % (max_wait_ms / 1000,))
elif future.failed() and not future.retriable():
raise future.exception
"Failed to update metadata after %.1f secs." % (max_wait,))
elif topic in self._metadata.unauthorized_topics:
raise Errors.TopicAuthorizationFailedError(set([topic]))
raise Errors.TopicAuthorizationFailedError(topic)
else:
log.debug("%s: _wait_on_metadata woke after %s secs.", str(self), timer.elapsed_ms / 1000)
log.debug("_wait_on_metadata woke after %s secs.", elapsed)
def _serialize(self, f, topic, data):
if not f:
@@ -979,18 +715,16 @@ class KafkaProducer(object):
def _partition(self, topic, partition, key, value,
serialized_key, serialized_value):
all_partitions = self._metadata.partitions_for_topic(topic)
available = self._metadata.available_partitions_for_topic(topic)
if all_partitions is None or available is None:
return None
if partition is not None:
assert partition >= 0
assert partition in all_partitions, 'Unrecognized partition'
assert partition in self._metadata.partitions_for_topic(topic), 'Unrecognized partition'
return partition
all_partitions = sorted(self._metadata.partitions_for_topic(topic))
available = list(self._metadata.available_partitions_for_topic(topic))
return self.config['partitioner'](serialized_key,
sorted(all_partitions),
list(available))
all_partitions,
available)
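The partitioner config is called with the arguments shown above (serialized key, all partitions, available partitions). A minimal custom partitioner, purely as a sketch:

    from kafka import KafkaProducer

    def example_partitioner(key_bytes, all_partitions, available):
        # keyless records go to the first available partition; keyed records hash
        if key_bytes is None:
            return (available or all_partitions)[0]
        return all_partitions[hash(key_bytes) % len(all_partitions)]

    producer = KafkaProducer(bootstrap_servers='localhost:9092',   # placeholder
                             partitioner=example_partitioner)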
def metrics(self, raw=False):
"""Get metrics on producer performance.
@@ -1002,8 +736,6 @@ class KafkaProducer(object):
This is an unstable interface. It may change in future
releases without warning.
"""
if not self._metrics:
return
if raw:
return self._metrics.metrics.copy()
@@ -1015,6 +747,3 @@ class KafkaProducer(object):
metrics[k.group][k.name] = {}
metrics[k.group][k.name] = v.value()
return metrics
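A brief sketch of reading the nested dict returned by metrics(), following the group/name/value layout built above (`producer` assumed to exist):

    stats = producer.metrics()                # {metric_group: {metric_name: value}}
    for group, measurements in stats.items():
        for name, value in measurements.items():
            print(group, name, value)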
def __str__(self):
return "<KafkaProducer client_id=%s transactional_id=%s>" % (self.config['client_id'], self.config['transactional_id'])