All checks were successful
continuous-integration/drone/push Build is passing
764 lines
37 KiB
Python
764 lines
37 KiB
Python
from __future__ import absolute_import, division
|
|
|
|
import collections
|
|
import copy
|
|
import heapq
|
|
import logging
|
|
import threading
|
|
import time
|
|
|
|
from kafka.vendor import six
|
|
|
|
from kafka import errors as Errors
|
|
from kafka.metrics.measurable import AnonMeasurable
|
|
from kafka.metrics.stats import Avg, Max, Rate
|
|
from kafka.producer.transaction_manager import ProducerIdAndEpoch
|
|
from kafka.protocol.init_producer_id import InitProducerIdRequest
|
|
from kafka.protocol.produce import ProduceRequest
|
|
from kafka.structs import TopicPartition
|
|
from kafka.version import __version__
|
|
|
|
# Module-level logger shared by Sender and SenderMetrics, named after this module.
log = logging.getLogger(__name__)
|
|
|
|
|
|
class Sender(threading.Thread):
    """
    The background thread that handles the sending of produce requests to the
    Kafka cluster. This thread makes metadata requests to renew its view of the
    cluster and then sends produce requests to the appropriate nodes.

    Configuration is taken from DEFAULT_CONFIG, overridden by any matching
    keyword arguments passed to the constructor.
    """
    DEFAULT_CONFIG = {
        'max_request_size': 1048576,
        'acks': 1,
        'retries': float('inf'),
        'request_timeout_ms': 30000,
        'retry_backoff_ms': 100,
        'metrics': None,
        'guarantee_message_order': False,
        'transaction_manager': None,
        'transactional_id': None,
        'transaction_timeout_ms': 60000,
        'client_id': 'kafka-python-' + __version__,
    }

    def __init__(self, client, metadata, accumulator, **configs):
        """Create the sender thread (it is not started here).

        Arguments:
            client: Network client used to connect, send and poll.
            metadata: Accepted for interface compatibility only; the cluster
                metadata actually used is ``client.cluster``.
            accumulator: Record accumulator that batches are drained from.
            **configs: Overrides for keys present in DEFAULT_CONFIG. Keys
                that are not in DEFAULT_CONFIG are left untouched and ignored.
        """
        super(Sender, self).__init__()
        self.config = copy.copy(self.DEFAULT_CONFIG)
        for key in self.config:
            if key in configs:
                self.config[key] = configs.pop(key)

        self.name = self.config['client_id'] + '-network-thread'
        self._client = client
        self._accumulator = accumulator
        # NOTE: the `metadata` argument is intentionally not stored; the
        # client's own cluster object is used instead.
        self._metadata = client.cluster
        self._running = True
        self._force_close = False
        self._topics_to_add = set()
        if self.config['metrics']:
            self._sensors = SenderMetrics(self.config['metrics'], self._client, self._metadata)
        else:
            self._sensors = None
        self._transaction_manager = self.config['transaction_manager']
        # A per-partition queue of batches ordered by creation time for tracking the in-flight batches
        self._in_flight_batches = collections.defaultdict(list)

    def _maybe_remove_from_inflight_batches(self, batch):
        """Remove `batch` from its partition's in-flight heap, if present.

        Does nothing when the partition has no in-flight queue or the batch
        is not in it.
        """
        # Use .get(): plain indexing on the defaultdict would never raise
        # KeyError and would insert an empty queue for every miss.
        queue = self._in_flight_batches.get(batch.topic_partition)
        if not queue:
            return
        try:
            idx = queue.index((batch.created, batch))
        except ValueError:
            return
        # https://stackoverflow.com/questions/10162679/python-delete-element-from-heap
        # Overwrite the entry with the last element, drop the tail, then
        # restore the heap invariant.
        queue[idx] = queue[-1]
        queue.pop()
        heapq.heapify(queue)

    def _get_expired_inflight_batches(self, now=None):
        """Get the in-flight batches that have reached delivery timeout.

        Expired batches are popped from their per-partition heaps; partitions
        whose heap becomes empty are dropped from the tracking dict.

        Arguments:
            now: Accepted for symmetry with the caller; currently unused here.

        Returns:
            list: The expired batches.

        Raises:
            IllegalStateError: If an expired in-flight batch already has a
                final state.
        """
        expired_batches = []
        to_remove = []
        for tp, queue in six.iteritems(self._in_flight_batches):
            while queue:
                _created_at, batch = queue[0]
                if batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms):
                    heapq.heappop(queue)
                    if batch.final_state is None:
                        expired_batches.append(batch)
                    else:
                        raise Errors.IllegalStateError("%s batch created at %s gets unexpected final state %s" % (batch.topic_partition, batch.created, batch.final_state))
                else:
                    # Oldest batch is still live: let the accumulator track
                    # its expiry and stop scanning this partition.
                    self._accumulator.maybe_update_next_batch_expiry_time(batch)
                    break
            else:
                # Queue fully drained (loop ended without break).
                # Avoid mutating in_flight_batches during iteration
                to_remove.append(tp)
        for tp in to_remove:
            del self._in_flight_batches[tp]
        return expired_batches

    def run(self):
        """The main run loop for the sender thread."""
        log.debug("%s: Starting Kafka producer I/O thread.", str(self))

        # main loop, runs until close is called
        while self._running:
            try:
                self.run_once()
            except Exception:
                log.exception("%s: Uncaught error in kafka producer I/O thread", str(self))

        log.debug("%s: Beginning shutdown of Kafka producer I/O thread, sending"
                  " remaining records.", str(self))

        # okay we stopped accepting requests but there may still be
        # requests in the accumulator or waiting for acknowledgment,
        # wait until these are completed.
        while (not self._force_close
               and (self._accumulator.has_undrained()
                    or self._client.in_flight_request_count() > 0)):
            try:
                self.run_once()
            except Exception:
                log.exception("%s: Uncaught error in kafka producer I/O thread", str(self))

        if self._force_close:
            # We need to fail all the incomplete batches and wake up the
            # threads waiting on the futures.
            self._accumulator.abort_incomplete_batches()

        try:
            self._client.close()
        except Exception:
            log.exception("%s: Failed to close network client", str(self))

        log.debug("%s: Shutdown of Kafka producer I/O thread has completed.", str(self))

    def run_once(self):
        """Run a single iteration of sending.

        Registers any pending topics with the client, lets the transaction
        manager (if configured) make progress, then sends producer data and
        polls the client.
        """
        while self._topics_to_add:
            self._client.add_topic(self._topics_to_add.pop())

        if self._transaction_manager:
            try:
                if not self._transaction_manager.is_transactional():
                    # this is an idempotent producer, so make sure we have a producer id
                    self._maybe_wait_for_producer_id()
                elif self._transaction_manager.has_in_flight_transactional_request() or self._maybe_send_transactional_request():
                    # as long as there are outstanding transactional requests, we simply wait for them to return
                    self._client.poll(timeout_ms=self.config['retry_backoff_ms'])
                    return

                # do not continue sending if the transaction manager is in a failed state or if there
                # is no producer id (for the idempotent case).
                if self._transaction_manager.has_fatal_error() or not self._transaction_manager.has_producer_id():
                    last_error = self._transaction_manager.last_error
                    if last_error is not None:
                        self._maybe_abort_batches(last_error)
                    self._client.poll(timeout_ms=self.config['retry_backoff_ms'])
                    return
                elif self._transaction_manager.has_abortable_error():
                    self._accumulator.abort_undrained_batches(self._transaction_manager.last_error)

            except Errors.SaslAuthenticationFailedError as e:
                # This is already logged as error, but propagated here to perform any clean ups.
                log.debug("%s: Authentication exception while processing transactional request: %s", str(self), e)
                self._transaction_manager.authentication_failed(e)

        poll_timeout_ms = self._send_producer_data()
        self._client.poll(timeout_ms=poll_timeout_ms)

    def _send_producer_data(self, now=None):
        """Drain ready batches, expire timed-out ones and send produce requests.

        Arguments:
            now (float, optional): Current time in seconds; defaults to
                ``time.time()``.

        Returns:
            The timeout in milliseconds to use for the next client poll.
            This is 0 when there are ready nodes or when the producer id was
            just reset.
        """
        now = time.time() if now is None else now
        # get the list of partitions with data ready to send
        result = self._accumulator.ready(self._metadata, now=now)
        ready_nodes, next_ready_check_delay, unknown_leaders_exist = result

        # if there are any partitions whose leaders are not known yet, force
        # metadata update
        if unknown_leaders_exist:
            log.debug('%s: Unknown leaders exist, requesting metadata update', str(self))
            self._metadata.request_update()

        # remove any nodes we aren't ready to send to
        not_ready_timeout_ms = float('inf')
        # Iterate over a copy because ready_nodes is mutated inside the loop.
        for node in list(ready_nodes):
            if not self._client.is_ready(node):
                node_delay_ms = self._client.connection_delay(node)
                log.debug('%s: Node %s not ready; delaying produce of accumulated batch (%f ms)', str(self), node, node_delay_ms)
                self._client.maybe_connect(node, wakeup=False)
                ready_nodes.remove(node)
                not_ready_timeout_ms = min(not_ready_timeout_ms, node_delay_ms)

        # create produce requests
        batches_by_node = self._accumulator.drain(
            self._metadata, ready_nodes, self.config['max_request_size'], now=now)

        # Track every drained batch as in-flight, ordered by creation time.
        for batch_list in six.itervalues(batches_by_node):
            for batch in batch_list:
                item = (batch.created, batch)
                queue = self._in_flight_batches[batch.topic_partition]
                heapq.heappush(queue, item)

        if self.config['guarantee_message_order']:
            # Mute all the partitions drained
            for batch_list in six.itervalues(batches_by_node):
                for batch in batch_list:
                    self._accumulator.muted.add(batch.topic_partition)

        self._accumulator.reset_next_batch_expiry_time()
        expired_batches = self._accumulator.expired_batches(now=now)
        expired_batches.extend(self._get_expired_inflight_batches(now=now))

        if expired_batches:
            log.debug("%s: Expired %s batches in accumulator", str(self), len(expired_batches))

        # Reset the producer_id if an expired batch has previously been sent to the broker.
        # See the documentation of `TransactionState.reset_producer_id` to understand why
        # we need to reset the producer id here.
        if self._transaction_manager and any([batch.in_retry() for batch in expired_batches]):
            needs_transaction_state_reset = True
        else:
            needs_transaction_state_reset = False

        for expired_batch in expired_batches:
            error = Errors.KafkaTimeoutError(
                "Expiring %d record(s) for %s: %s ms has passed since batch creation" % (
                    expired_batch.record_count, expired_batch.topic_partition,
                    int((time.time() - expired_batch.created) * 1000)))
            self._fail_batch(expired_batch, error, base_offset=-1)

        if self._sensors:
            self._sensors.update_produce_request_metrics(batches_by_node)

        if needs_transaction_state_reset:
            self._transaction_manager.reset_producer_id()
            return 0

        requests = self._create_produce_requests(batches_by_node)
        # If we have any nodes that are ready to send + have sendable data,
        # poll with 0 timeout so this can immediately loop and try sending more
        # data. Otherwise, the timeout will be the smaller value between next
        # batch expiry time, and the delay time for checking data availability.
        # Note that the nodes may have data that isn't yet sendable due to
        # lingering, backing off, etc. This specifically does not include nodes with
        # sendable data that aren't ready to send since they would cause busy
        # looping.
        poll_timeout_ms = min(next_ready_check_delay * 1000,
                              not_ready_timeout_ms,
                              self._accumulator.next_expiry_time_ms - now * 1000)
        if poll_timeout_ms < 0:
            poll_timeout_ms = 0

        if ready_nodes:
            log.debug("%s: Nodes with data ready to send: %s", str(self), ready_nodes)  # trace
            log.debug("%s: Created %d produce requests: %s", str(self), len(requests), requests)  # trace
            # if some partitions are already ready to be sent, the select time
            # would be 0; otherwise if some partition already has some data
            # accumulated but not ready yet, the select time will be the time
            # difference between now and its linger expiry time; otherwise the
            # select time will be the time difference between now and the
            # metadata expiry time
            poll_timeout_ms = 0

        for node_id, request in six.iteritems(requests):
            batches = batches_by_node[node_id]
            log.debug('%s: Sending Produce Request: %r', str(self), request)
            (self._client.send(node_id, request, wakeup=False)
                 .add_callback(
                     self._handle_produce_response, node_id, time.time(), batches)
                 .add_errback(
                     self._failed_produce, batches, node_id))
        return poll_timeout_ms

    def _maybe_send_transactional_request(self):
        """Send the transaction manager's next pending request, if any.

        Returns:
            bool: False when the transaction manager has no request to send;
            True otherwise (whether the request was sent or re-queued).
        """
        # NOTE(review): has_incomplete is read without being called --
        # presumably a property on the accumulator; confirm.
        if self._transaction_manager.is_completing() and self._accumulator.has_incomplete:
            if self._transaction_manager.is_aborting():
                self._accumulator.abort_undrained_batches(Errors.KafkaError("Failing batch since transaction was aborted"))
            # There may still be requests left which are being retried. Since we do not know whether they had
            # been successfully appended to the broker log, we must resend them until their final status is clear.
            # If they had been appended and we did not receive the error, then our sequence number would no longer
            # be correct which would lead to an OutOfSequenceNumberError.
            if not self._accumulator.flush_in_progress():
                self._accumulator.begin_flush()

        next_request_handler = self._transaction_manager.next_request_handler(self._accumulator.has_incomplete)
        if next_request_handler is None:
            return False

        log.debug("%s: Sending transactional request %s", str(self), next_request_handler.request)
        # Bind before the loop: if a force-close is already in progress the
        # loop body never runs and target_node is still read afterwards.
        target_node = None
        while not self._force_close:
            target_node = None
            try:
                if next_request_handler.needs_coordinator():
                    target_node = self._transaction_manager.coordinator(next_request_handler.coordinator_type)
                    if target_node is None:
                        self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
                        break
                    elif not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']):
                        self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
                        target_node = None
                        break
                else:
                    target_node = self._client.least_loaded_node()
                    if target_node is not None and not self._client.await_ready(target_node, timeout_ms=self.config['request_timeout_ms']):
                        target_node = None

                if target_node is not None:
                    if next_request_handler.is_retry:
                        time.sleep(self.config['retry_backoff_ms'] / 1000)
                    txn_correlation_id = self._transaction_manager.next_in_flight_request_correlation_id()
                    future = self._client.send(target_node, next_request_handler.request)
                    future.add_both(next_request_handler.on_complete, txn_correlation_id)
                    return True

            except Exception as e:
                # Logger.warn() was removed in Python 3.13; use warning().
                log.warning("%s: Got an exception when trying to find a node to send a transactional request to. Going to back off and retry: %s", str(self), e)
                if next_request_handler.needs_coordinator():
                    self._transaction_manager.lookup_coordinator_for_request(next_request_handler)
                    break

            time.sleep(self.config['retry_backoff_ms'] / 1000)
            self._metadata.request_update()

        if target_node is None:
            self._transaction_manager.retry(next_request_handler)

        return True

    def _maybe_abort_batches(self, exc):
        """Abort all accumulator batches with `exc` if any are incomplete."""
        if self._accumulator.has_incomplete:
            log.error("%s: Aborting producer batches due to fatal error: %s", str(self), exc)
            self._accumulator.abort_batches(exc)

    def initiate_close(self):
        """Start closing the sender (won't complete until all data is sent)."""
        self._running = False
        self._accumulator.close()
        self.wakeup()

    def force_close(self):
        """Closes the sender without sending out any pending messages."""
        self._force_close = True
        self.initiate_close()

    def add_topic(self, topic):
        """Queue `topic` to be registered with the client and wake the sender."""
        # This is generally called from a separate thread
        # so this needs to be a thread-safe operation
        # we assume that checking set membership across threads
        # is ok where self._client._topics should never
        # remove topics for a producer instance, only add them.
        if topic not in self._client._topics:
            self._topics_to_add.add(topic)
            self.wakeup()

    def _maybe_wait_for_producer_id(self):
        """Block until the transaction manager has a producer id.

        Repeatedly sends InitProducerIdRequest to the least loaded node,
        backing off between attempts. Stops on success or after moving the
        transaction manager to a fatal error for a non-retriable response.
        """
        while not self._transaction_manager.has_producer_id():
            # Bound up front so the except handlers can always log it, even
            # if node selection itself raises.
            node_id = None
            try:
                node_id = self._client.least_loaded_node()
                if node_id is None or not self._client.await_ready(node_id):
                    log.debug("%s, Could not find an available broker to send InitProducerIdRequest to." +
                              " Will back off and try again.", str(self))
                    time.sleep(self._client.least_loaded_node_refresh_ms() / 1000)
                    continue
                version = self._client.api_version(InitProducerIdRequest, max_version=1)
                request = InitProducerIdRequest[version](
                    transactional_id=self.config['transactional_id'],
                    transaction_timeout_ms=self.config['transaction_timeout_ms'],
                )
                response = self._client.send_and_receive(node_id, request)
                error_type = Errors.for_code(response.error_code)
                if error_type is Errors.NoError:
                    self._transaction_manager.set_producer_id_and_epoch(ProducerIdAndEpoch(response.producer_id, response.producer_epoch))
                    break
                elif getattr(error_type, 'retriable', False):
                    log.debug("%s: Retriable error from InitProducerId response: %s", str(self), error_type.__name__)
                    if getattr(error_type, 'invalid_metadata', False):
                        self._metadata.request_update()
                else:
                    self._transaction_manager.transition_to_fatal_error(error_type())
                    break
            except Errors.KafkaConnectionError:
                log.debug("%s: Broker %s disconnected while awaiting InitProducerId response", str(self), node_id)
            except Errors.RequestTimedOutError:
                log.debug("%s: InitProducerId request to node %s timed out", str(self), node_id)
            # Reached after a retriable error, a disconnect or a timeout.
            log.debug("%s: Retry InitProducerIdRequest in %sms.", str(self), self.config['retry_backoff_ms'])
            time.sleep(self.config['retry_backoff_ms'] / 1000)

    def _failed_produce(self, batches, node_id, error):
        """Errback for a produce request: complete every batch with `error`."""
        log.error("%s: Error sending produce request to node %d: %s", str(self), node_id, error)  # trace
        for batch in batches:
            self._complete_batch(batch, error, -1)

    def _handle_produce_response(self, node_id, send_time, batches, response):
        """Handle a produce response.

        Arguments:
            node_id: Node the request was sent to (unused here).
            send_time: Time the request was sent (unused here).
            batches: The batches that were included in the request.
            response: The produce response; a falsy value means no response
                was expected (acks = 0).
        """
        # if we have a response, parse it
        log.debug('%s: Parsing produce response: %r', str(self), response)
        if response:
            batches_by_partition = {batch.topic_partition: batch
                                    for batch in batches}

            for topic, partitions in response.topics:
                for partition_info in partitions:
                    # The per-partition tuple grows with the API version.
                    if response.API_VERSION < 2:
                        partition, error_code, offset = partition_info
                        ts = None
                    elif 2 <= response.API_VERSION <= 4:
                        partition, error_code, offset, ts = partition_info
                    elif 5 <= response.API_VERSION <= 7:
                        partition, error_code, offset, ts, _log_start_offset = partition_info
                    else:
                        # Currently unused / TODO: KIP-467
                        partition, error_code, offset, ts, _log_start_offset, _record_errors, _global_error = partition_info
                    tp = TopicPartition(topic, partition)
                    error = Errors.for_code(error_code)
                    batch = batches_by_partition[tp]
                    self._complete_batch(batch, error, offset, timestamp_ms=ts)

        else:
            # this is the acks = 0 case, just complete all requests
            for batch in batches:
                self._complete_batch(batch, None, -1)

    def _fail_batch(self, batch, exception, base_offset=None, timestamp_ms=None):
        """Fail `batch` with `exception` and update transaction state/metrics.

        Arguments:
            batch: The batch to fail.
            exception: An exception instance, or an exception class which is
                instantiated with no arguments.
            base_offset (int, optional): Offset passed through to batch.done().
            timestamp_ms (int, optional): Timestamp passed through to
                batch.done().
        """
        # Accept either an exception class or an instance.
        exception = exception() if isinstance(exception, type) else exception
        if self._transaction_manager:
            if isinstance(exception, Errors.OutOfOrderSequenceNumberError) and \
                    not self._transaction_manager.is_transactional() and \
                    self._transaction_manager.has_producer_id(batch.producer_id):
                log.error("%s: The broker received an out of order sequence number for topic-partition %s"
                          " at offset %s. This indicates data loss on the broker, and should be investigated.",
                          str(self), batch.topic_partition, base_offset)

                # Reset the transaction state since we have hit an irrecoverable exception and cannot make any guarantees
                # about the previously committed message. Note that this will discard the producer id and sequence
                # numbers for all existing partitions.
                self._transaction_manager.reset_producer_id()
            elif isinstance(exception, (Errors.ClusterAuthorizationFailedError,
                                        Errors.TransactionalIdAuthorizationFailedError,
                                        Errors.ProducerFencedError,
                                        Errors.InvalidTxnStateError)):
                self._transaction_manager.transition_to_fatal_error(exception)
            elif self._transaction_manager.is_transactional():
                self._transaction_manager.transition_to_abortable_error(exception)

        if self._sensors:
            self._sensors.record_errors(batch.topic_partition.topic, batch.record_count)

        # Only release the batch if this call is the one that finalized it.
        if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms, exception=exception):
            self._maybe_remove_from_inflight_batches(batch)
            self._accumulator.deallocate(batch)

    def _complete_batch(self, batch, error, base_offset, timestamp_ms=None):
        """Complete or retry the given batch of records.

        Arguments:
            batch (ProducerBatch): The record batch
            error (Exception): The error (or None if none)
            base_offset (int): The base offset assigned to the records if successful
            timestamp_ms (int, optional): The timestamp returned by the broker for this batch
        """
        # Standardize no-error to None
        if error is Errors.NoError:
            error = None

        if error is not None:
            if self._can_retry(batch, error):
                # retry
                log.warning("%s: Got error produce response on topic-partition %s,"
                            " retrying (%s attempts left). Error: %s",
                            str(self), batch.topic_partition,
                            self.config['retries'] - batch.attempts - 1,
                            error)

                # If idempotence is enabled only retry the request if the batch matches our current producer id and epoch
                if not self._transaction_manager or self._transaction_manager.producer_id_and_epoch.match(batch):
                    log.debug("%s: Retrying batch to topic-partition %s. Sequence number: %s",
                              str(self), batch.topic_partition,
                              self._transaction_manager.sequence_number(batch.topic_partition) if self._transaction_manager else None)
                    self._accumulator.reenqueue(batch)
                    self._maybe_remove_from_inflight_batches(batch)
                    if self._sensors:
                        self._sensors.record_retries(batch.topic_partition.topic, batch.record_count)
                else:
                    log.warning("%s: Attempted to retry sending a batch but the producer id/epoch changed from %s/%s to %s/%s. This batch will be dropped",
                                str(self), batch.producer_id, batch.producer_epoch,
                                self._transaction_manager.producer_id_and_epoch.producer_id,
                                self._transaction_manager.producer_id_and_epoch.epoch)
                    self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms)
            else:
                if error is Errors.TopicAuthorizationFailedError:
                    error = error(batch.topic_partition.topic)

                # tell the user the result of their request
                self._fail_batch(batch, error, base_offset=base_offset, timestamp_ms=timestamp_ms)

            # NOTE(review): nesting of the two checks below was reconstructed
            # from the flattened source; confirm against upstream sender.py.
            if error is Errors.UnknownTopicOrPartitionError:
                log.warning("%s: Received unknown topic or partition error in produce request on partition %s."
                            " The topic/partition may not exist or the user may not have Describe access to it",
                            str(self), batch.topic_partition)

            if getattr(error, 'invalid_metadata', False):
                self._metadata.request_update()

        else:
            if batch.done(base_offset=base_offset, timestamp_ms=timestamp_ms):
                self._maybe_remove_from_inflight_batches(batch)
                self._accumulator.deallocate(batch)

            if self._transaction_manager and self._transaction_manager.producer_id_and_epoch.match(batch):
                self._transaction_manager.increment_sequence_number(batch.topic_partition, batch.record_count)
                log.debug("%s: Incremented sequence number for topic-partition %s to %s", str(self), batch.topic_partition,
                          self._transaction_manager.sequence_number(batch.topic_partition))

        # Unmute the completed partition.
        if self.config['guarantee_message_order']:
            self._accumulator.muted.remove(batch.topic_partition)

    def _can_retry(self, batch, error):
        """
        We can retry a send if the error is transient and the number of
        attempts taken is fewer than the maximum allowed

        Additionally the batch must not have reached its delivery timeout
        and must not already have a final state.
        """
        return (not batch.has_reached_delivery_timeout(self._accumulator.delivery_timeout_ms) and
                batch.attempts < self.config['retries'] and
                batch.final_state is None and
                getattr(error, 'retriable', False))

    def _create_produce_requests(self, collated):
        """
        Transfer the record batches into a list of produce requests on a
        per-node basis.

        Arguments:
            collated: {node_id: [ProducerBatch]}

        Returns:
            dict: {node_id: ProduceRequest} (version depends on client api_versions)
        """
        requests = {}
        for node_id, batches in six.iteritems(collated):
            # Nodes with no batches get no request.
            if batches:
                requests[node_id] = self._produce_request(
                    node_id, self.config['acks'],
                    self.config['request_timeout_ms'], batches)
        return requests

    def _produce_request(self, node_id, acks, timeout, batches):
        """Create a produce request from the given record batches.

        Arguments:
            node_id: Destination node (unused when building the request).
            acks: Value for the request's required_acks field.
            timeout: Value for the request's timeout field.
            batches: Batches to include; records are keyed by topic and
                partition, so a later batch for the same partition replaces
                an earlier one.

        Returns:
            ProduceRequest (version depends on client api_versions)
        """
        produce_records_by_partition = collections.defaultdict(dict)
        for batch in batches:
            topic = batch.topic_partition.topic
            partition = batch.topic_partition.partition

            buf = batch.records.buffer()
            produce_records_by_partition[topic][partition] = buf

        version = self._client.api_version(ProduceRequest, max_version=7)
        topic_partition_data = [
            (topic, list(partition_info.items()))
            for topic, partition_info in six.iteritems(produce_records_by_partition)]
        transactional_id = self._transaction_manager.transactional_id if self._transaction_manager else None
        if version >= 3:
            return ProduceRequest[version](
                transactional_id=transactional_id,
                required_acks=acks,
                timeout=timeout,
                topics=topic_partition_data,
            )
        else:
            # Versions below 3 have no transactional_id field.
            if transactional_id is not None:
                log.warning('%s: Broker does not support ProduceRequest v3+, required for transactional_id', str(self))
            return ProduceRequest[version](
                required_acks=acks,
                timeout=timeout,
                topics=topic_partition_data,
            )

    def wakeup(self):
        """Wake up the selector associated with this send thread."""
        self._client.wakeup()

    def bootstrap_connected(self):
        """Return the client's bootstrap_connected() result."""
        return self._client.bootstrap_connected()

    def __str__(self):
        return "<Sender client_id=%s transactional_id=%s>" % (self.config['client_id'], self.config['transactional_id'])
|
|
|
|
|
|
class SenderMetrics(object):
    """Registers and records the producer metrics maintained by the sender.

    Global metrics are registered at construction time in the
    'producer-metrics' group; per-topic metrics are registered lazily in
    'producer-topic-metrics.<topic>' the first time a topic is seen.
    """

    def __init__(self, metrics, client, metadata):
        """Register the global producer sensors and metrics.

        Arguments:
            metrics: Metrics registry providing sensor(), get_sensor(),
                metric_name() and add_metric().
            client: Network client; read for the in-flight request count.
            metadata: Cluster metadata; read for the last successful refresh
                time.
        """
        self.metrics = metrics
        self._client = client
        self._metadata = metadata

        sensor_name = 'batch-size'
        self.batch_size_sensor = self.metrics.sensor(sensor_name)
        self.add_metric('batch-size-avg', Avg(),
                        sensor_name=sensor_name,
                        description='The average number of bytes sent per partition per-request.')
        self.add_metric('batch-size-max', Max(),
                        sensor_name=sensor_name,
                        description='The max number of bytes sent per partition per-request.')

        sensor_name = 'compression-rate'
        self.compression_rate_sensor = self.metrics.sensor(sensor_name)
        self.add_metric('compression-rate-avg', Avg(),
                        sensor_name=sensor_name,
                        description='The average compression rate of record batches.')

        sensor_name = 'queue-time'
        self.queue_time_sensor = self.metrics.sensor(sensor_name)
        self.add_metric('record-queue-time-avg', Avg(),
                        sensor_name=sensor_name,
                        description='The average time in ms record batches spent in the record accumulator.')
        self.add_metric('record-queue-time-max', Max(),
                        sensor_name=sensor_name,
                        description='The maximum time in ms record batches spent in the record accumulator.')

        sensor_name = 'records-per-request'
        self.records_per_request_sensor = self.metrics.sensor(sensor_name)
        self.add_metric('record-send-rate', Rate(),
                        sensor_name=sensor_name,
                        description='The average number of records sent per second.')
        self.add_metric('records-per-request-avg', Avg(),
                        sensor_name=sensor_name,
                        description='The average number of records per request.')

        sensor_name = 'bytes'
        self.byte_rate_sensor = self.metrics.sensor(sensor_name)
        self.add_metric('byte-rate', Rate(),
                        sensor_name=sensor_name,
                        description='The average number of bytes sent per second.')

        sensor_name = 'record-retries'
        self.retry_sensor = self.metrics.sensor(sensor_name)
        self.add_metric('record-retry-rate', Rate(),
                        sensor_name=sensor_name,
                        description='The average per-second number of retried record sends')

        sensor_name = 'errors'
        self.error_sensor = self.metrics.sensor(sensor_name)
        self.add_metric('record-error-rate', Rate(),
                        sensor_name=sensor_name,
                        description='The average per-second number of record sends that resulted in errors')

        sensor_name = 'record-size-max'
        self.max_record_size_sensor = self.metrics.sensor(sensor_name)
        self.add_metric('record-size-max', Max(),
                        sensor_name=sensor_name,
                        description='The maximum record size across all batches')
        self.add_metric('record-size-avg', Avg(),
                        sensor_name=sensor_name,
                        description='The average maximum record size per batch')

        # The two metrics below are computed on demand rather than recorded.
        self.add_metric('requests-in-flight',
                        AnonMeasurable(lambda *_: self._client.in_flight_request_count()),
                        description='The current number of in-flight requests awaiting a response.')

        self.add_metric('metadata-age',
                        AnonMeasurable(lambda _, now: (now - self._metadata._last_successful_refresh_ms) / 1000),
                        description='The age in seconds of the current producer metadata being used.')

    def add_metric(self, metric_name, measurable, group_name='producer-metrics',
                   description=None, tags=None,
                   sensor_name=None):
        """Register a metric, attached to a sensor when `sensor_name` is given.

        Arguments:
            metric_name (str): Name of the metric.
            measurable: Stat/measurable that computes the metric value.
            group_name (str): Metric group; defaults to 'producer-metrics'.
            description (str, optional): Human-readable description.
            tags (optional): Tags forwarded to the registry's metric_name().
            sensor_name (str, optional): Sensor to attach the metric to; when
                falsy the metric is added directly to the registry.
        """
        m = self.metrics
        metric = m.metric_name(metric_name, group_name, description, tags)
        if sensor_name:
            sensor = m.sensor(sensor_name)
            sensor.add(metric, measurable)
        else:
            m.add_metric(metric, measurable)

    def maybe_register_topic_metrics(self, topic):
        """Register the per-topic metrics for `topic` unless already present."""

        def sensor_name(name):
            return 'topic.{0}.{1}'.format(topic, name)

        # if one sensor of the metrics has been registered for the topic,
        # then all other sensors should have been registered; and vice versa
        if not self.metrics.get_sensor(sensor_name('records-per-batch')):
            group_name = 'producer-topic-metrics.' + topic

            self.add_metric('record-send-rate', Rate(),
                            sensor_name=sensor_name('records-per-batch'),
                            group_name=group_name,
                            description='Records sent per second for topic ' + topic)

            self.add_metric('byte-rate', Rate(),
                            sensor_name=sensor_name('bytes'),
                            group_name=group_name,
                            description='Bytes per second for topic ' + topic)

            self.add_metric('compression-rate', Avg(),
                            sensor_name=sensor_name('compression-rate'),
                            group_name=group_name,
                            description='Average Compression ratio for topic ' + topic)

            self.add_metric('record-retry-rate', Rate(),
                            sensor_name=sensor_name('record-retries'),
                            group_name=group_name,
                            description='Record retries per second for topic ' + topic)

            self.add_metric('record-error-rate', Rate(),
                            sensor_name=sensor_name('record-errors'),
                            group_name=group_name,
                            description='Record errors per second for topic ' + topic)

    def update_produce_request_metrics(self, batches_map):
        """Record per-topic and global metrics for the drained batches.

        Arguments:
            batches_map: {node_id: [batch]} mapping of batches about to be
                sent. Per-request totals are recorded only for nodes that
                have at least one batch.
        """
        for node_batch in batches_map.values():
            records = 0
            total_bytes = 0
            for batch in node_batch:
                # register all per-topic metrics at once
                topic = batch.topic_partition.topic
                self.maybe_register_topic_metrics(topic)

                # Read each batch statistic once and reuse it below.
                batch_bytes = batch.records.size_in_bytes()
                batch_compression = batch.records.compression_rate()

                # per-topic record send rate
                topic_records_count = self.metrics.get_sensor(
                    'topic.' + topic + '.records-per-batch')
                topic_records_count.record(batch.record_count)

                # per-topic bytes send rate
                topic_byte_rate = self.metrics.get_sensor(
                    'topic.' + topic + '.bytes')
                topic_byte_rate.record(batch_bytes)

                # per-topic compression rate
                topic_compression_rate = self.metrics.get_sensor(
                    'topic.' + topic + '.compression-rate')
                topic_compression_rate.record(batch_compression)

                # global metrics
                self.batch_size_sensor.record(batch_bytes)
                if batch.drained:
                    # Batch timestamps are in seconds; the queue-time
                    # metrics are documented in milliseconds.
                    self.queue_time_sensor.record((batch.drained - batch.created) * 1000)
                self.compression_rate_sensor.record(batch_compression)
                self.max_record_size_sensor.record(batch.max_record_size)
                records += batch.record_count
                total_bytes += batch_bytes

            if node_batch:
                self.records_per_request_sensor.record(records)
                self.byte_rate_sensor.record(total_bytes)

    def record_retries(self, topic, count):
        """Record `count` retried records globally and, if registered, for `topic`."""
        self.retry_sensor.record(count)
        sensor = self.metrics.get_sensor('topic.' + topic + '.record-retries')
        if sensor:
            sensor.record(count)

    def record_errors(self, topic, count):
        """Record `count` failed records globally and, if registered, for `topic`."""
        self.error_sensor.record(count)
        sensor = self.metrics.get_sensor('topic.' + topic + '.record-errors')
        if sensor:
            sensor.record(count)
|