API refactor

2025-10-07 16:25:52 +09:00
parent 76d0d86211
commit 91c7e04474
1171 changed files with 81940 additions and 44117 deletions


@@ -5,9 +5,14 @@ import io as StringIO
import math
import re
from ..metrics_core import Metric, METRIC_LABEL_NAME_RE
from ..samples import Exemplar, Sample, Timestamp
from ..metrics_core import Metric
from ..parser import (
    _last_unquoted_char, _next_unquoted_char, _parse_value, _split_quoted,
    _unquote_unescape, parse_labels,
)
from ..samples import BucketSpan, Exemplar, NativeHistogram, Sample, Timestamp
from ..utils import floatToGoString
from ..validation import _is_valid_legacy_metric_name, _validate_metric_name


def text_string_to_metric_families(text):
@@ -73,16 +78,6 @@ def _unescape_help(text):
    return ''.join(result)


def _parse_value(value):
    value = ''.join(value)
    if value != value.strip() or '_' in value:
        raise ValueError(f"Invalid value: {value!r}")
    try:
        return int(value)
    except ValueError:
        return float(value)


def _parse_timestamp(timestamp):
    timestamp = ''.join(timestamp)
    if not timestamp:
@@ -113,165 +108,31 @@ def _is_character_escaped(s, charpos):
    return num_bslashes % 2 == 1


def _parse_labels_with_state_machine(text):
    # The { has already been parsed.
    state = 'startoflabelname'
    labelname = []
    labelvalue = []
    labels = {}
    labels_len = 0

    for char in text:
        if state == 'startoflabelname':
            if char == '}':
                state = 'endoflabels'
            else:
                state = 'labelname'
                labelname.append(char)
        elif state == 'labelname':
            if char == '=':
                state = 'labelvaluequote'
            else:
                labelname.append(char)
        elif state == 'labelvaluequote':
            if char == '"':
                state = 'labelvalue'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'labelvalue':
            if char == '\\':
                state = 'labelvalueslash'
            elif char == '"':
                ln = ''.join(labelname)
                if not METRIC_LABEL_NAME_RE.match(ln):
                    raise ValueError("Invalid line, bad label name: " + text)
                if ln in labels:
                    raise ValueError("Invalid line, duplicate label name: " + text)
                labels[ln] = ''.join(labelvalue)
                labelname = []
                labelvalue = []
                state = 'endoflabelvalue'
            else:
                labelvalue.append(char)
        elif state == 'endoflabelvalue':
            if char == ',':
                state = 'labelname'
            elif char == '}':
                state = 'endoflabels'
            else:
                raise ValueError("Invalid line: " + text)
        elif state == 'labelvalueslash':
            state = 'labelvalue'
            if char == '\\':
                labelvalue.append('\\')
            elif char == 'n':
                labelvalue.append('\n')
            elif char == '"':
                labelvalue.append('"')
            else:
                labelvalue.append('\\' + char)
        elif state == 'endoflabels':
            if char == ' ':
                break
            else:
                raise ValueError("Invalid line: " + text)
        labels_len += 1
    return labels, labels_len


def _parse_labels(text):
    labels = {}

    # Raise error if we don't have valid labels
    if text and "=" not in text:
        raise ValueError

    # Copy original labels
    sub_labels = text
    try:
        # Process one label at a time
        while sub_labels:
            # The label name is before the equal
            value_start = sub_labels.index("=")
            label_name = sub_labels[:value_start]
            sub_labels = sub_labels[value_start + 1:]

            # Check for missing quotes
            if not sub_labels or sub_labels[0] != '"':
                raise ValueError

            # The first quote is guaranteed to be after the equal
            value_substr = sub_labels[1:]

            # Check for extra commas
            if not label_name or label_name[0] == ',':
                raise ValueError
            if not value_substr or value_substr[-1] == ',':
                raise ValueError

            # Find the last unescaped quote
            i = 0
            while i < len(value_substr):
                i = value_substr.index('"', i)
                if not _is_character_escaped(value_substr[:i], i):
                    break
                i += 1

            # The label value is between the first and last quote
            quote_end = i + 1
            label_value = sub_labels[1:quote_end]
            # Replace escaping if needed
            if "\\" in label_value:
                label_value = _replace_escaping(label_value)
            if not METRIC_LABEL_NAME_RE.match(label_name):
                raise ValueError("invalid line, bad label name: " + text)
            if label_name in labels:
                raise ValueError("invalid line, duplicate label name: " + text)
            labels[label_name] = label_value

            # Remove the processed label from the sub-slice for next iteration
            sub_labels = sub_labels[quote_end + 1:]
            if sub_labels.startswith(","):
                next_comma = 1
            else:
                next_comma = 0
            sub_labels = sub_labels[next_comma:]

            # Check for missing commas
            if sub_labels and next_comma == 0:
                raise ValueError

        return labels

    except ValueError:
        raise ValueError("Invalid labels: " + text)


def _parse_sample(text):
    separator = " # "
    # Detect the labels in the text
    label_start = text.find("{")
    label_start = _next_unquoted_char(text, '{')
    if label_start == -1 or separator in text[:label_start]:
        # We don't have labels, but there could be an exemplar.
        name_end = text.index(" ")
        name_end = _next_unquoted_char(text, ' ')
        name = text[:name_end]
        if not _is_valid_legacy_metric_name(name):
            raise ValueError("invalid metric name:" + text)
        # Parse the remaining text after the name
        remaining_text = text[name_end + 1:]
        value, timestamp, exemplar = _parse_remaining_text(remaining_text)
        return Sample(name, {}, value, timestamp, exemplar)
    # The name is before the labels
    name = text[:label_start]
    if separator not in text:
        # Line doesn't contain an exemplar
        # We can use `rindex` to find `label_end`
        label_end = text.rindex("}")
        label = text[label_start + 1:label_end]
        labels = _parse_labels(label)
    else:
        # Line potentially contains an exemplar
        # Fallback to parsing labels with a state machine
        labels, labels_len = _parse_labels_with_state_machine(text[label_start + 1:])
        label_end = labels_len + len(name)
    label_end = _next_unquoted_char(text, '}')
    labels = parse_labels(text[label_start + 1:label_end], True)
    if not name:
        # Name might be in the labels
        if '__name__' not in labels:
            raise ValueError
        name = labels['__name__']
        del labels['__name__']
    elif '__name__' in labels:
        raise ValueError("metric name specified more than once")
    # Parsing labels succeeded, continue parsing the remaining text
    remaining_text = text[label_end + 2:]
    value, timestamp, exemplar = _parse_remaining_text(remaining_text)
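
The refactored _parse_sample leans on _next_unquoted_char from ..parser to find '{', '}' and the value separator while skipping anything inside double quotes, which matters now that metric and label names may themselves be quoted. Below is a minimal standalone sketch of that scanning behavior, inferred from how the helper is called above rather than from its actual implementation; the sample line is made up.

# --- illustrative sketch, not part of the commit ---
def next_unquoted(text, chars, start=0):
    """Index of the first character from `chars` that sits outside double
    quotes (honoring backslash escapes inside quotes), or -1 if none."""
    in_quotes = False
    i = start
    while i < len(text):
        c = text[i]
        if in_quotes and c == '\\':
            i += 2  # skip the escaped character
            continue
        if c == '"':
            in_quotes = not in_quotes
        elif c in chars and not in_quotes:
            return i
        i += 1
    return -1


line = '{"my.metric","le"="0.5"} 1.5 123'  # hypothetical quoted-name sample line
assert next_unquoted(line, '{') == 0       # the label set opens the line
assert next_unquoted(line, '}') == 23      # first '}' outside the quotes
assert next_unquoted(line, ' ') == 24      # the space before the value
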
@@ -294,7 +155,12 @@ def _parse_remaining_text(text):
    text = split_text[1]

    it = iter(text)
    in_quotes = False
    for char in it:
        if char == '"':
            in_quotes = not in_quotes
        if in_quotes:
            continue
        if state == 'timestamp':
            if char == '#' and not timestamp:
                state = 'exemplarspace'
@@ -314,8 +180,9 @@ def _parse_remaining_text(text):
                raise ValueError("Invalid line: " + text)
        elif state == 'exemplarstartoflabels':
            if char == '{':
                label_start, label_end = text.index("{"), text.rindex("}")
                exemplar_labels = _parse_labels(text[label_start + 1:label_end])
                label_start = _next_unquoted_char(text, '{')
                label_end = _last_unquoted_char(text, '}')
                exemplar_labels = parse_labels(text[label_start + 1:label_end], True)
                state = 'exemplarparsedlabels'
            else:
                raise ValueError("Invalid line: " + text)
@@ -364,6 +231,154 @@ def _parse_remaining_text(text):
    return val, ts, exemplar


def _parse_nh_sample(text, suffixes):
    """Determines if the line has a native histogram sample, and parses it if so."""
    labels_start = _next_unquoted_char(text, '{')
    labels_end = -1

    # Finding a native histogram sample requires careful parsing of
    # possibly-quoted text, which can appear in metric names, label names, and
    # values.
    #
    # First, we need to determine if there are metric labels. Find the space
    # between the metric definition and the rest of the line. Look for unquoted
    # space or {.
    i = 0
    has_metric_labels = False
    i = _next_unquoted_char(text, ' {')
    if i == -1:
        return

    # If the first unquoted char was a {, then that is the metric labels (which
    # could contain a UTF-8 metric name).
    if text[i] == '{':
        has_metric_labels = True
        # Consume the labels -- jump ahead to the close bracket.
        labels_end = i = _next_unquoted_char(text, '}', i)
        if labels_end == -1:
            raise ValueError

    # If there is no subsequent unquoted {, then it's definitely not a nh.
    nh_value_start = _next_unquoted_char(text, '{', i + 1)
    if nh_value_start == -1:
        return

    # Edge case: if there is an unquoted # between the metric definition and the {,
    # then this is actually an exemplar
    exemplar = _next_unquoted_char(text, '#', i + 1)
    if exemplar != -1 and exemplar < nh_value_start:
        return

    nh_value_end = _next_unquoted_char(text, '}', nh_value_start)
    if nh_value_end == -1:
        raise ValueError

    if has_metric_labels:
        labelstext = text[labels_start + 1:labels_end]
        labels = parse_labels(labelstext, True)
        name_end = labels_start
        name = text[:name_end]
        if name.endswith(suffixes):
            raise ValueError("the sample name of a native histogram with labels should have no suffixes", name)
        if not name:
            # Name might be in the labels
            if '__name__' not in labels:
                raise ValueError
            name = labels['__name__']
            del labels['__name__']
        # Edge case: the only "label" is the name definition.
        if not labels:
            labels = None
        nh_value = text[nh_value_start:]
        nat_hist_value = _parse_nh_struct(nh_value)
        return Sample(name, labels, None, None, None, nat_hist_value)
    # check if it's a native histogram
    else:
        nh_value = text[nh_value_start:]
        name_end = nh_value_start - 1
        name = text[:name_end]
        if name.endswith(suffixes):
            raise ValueError("the sample name of a native histogram should have no suffixes", name)
        # Not possible for UTF-8 name here, that would have been caught as having a labelset.
        nat_hist_value = _parse_nh_struct(nh_value)
        return Sample(name, None, None, None, None, nat_hist_value)
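
For orientation, the exposition lines _parse_nh_sample is meant to recognize look roughly like the made-up strings below; the exact textual format is an assumption pieced together from this diff (the struct fields match what _parse_nh_struct extracts further down), not a published spec.

# --- illustrative sketch, not part of the commit ---
# No metric labels: the first unquoted '{' already starts the native histogram struct.
bare = ('nh_metric {count:24,sum:100,schema:0,zero_threshold:0.001,zero_count:4,'
        'positive_spans:[0:2,1:2],positive_deltas:[2,1,-2,3]}')

# With metric labels: an unquoted '{...}' label set comes first, then a second
# unquoted '{' opens the struct, so both branches above have something to parse.
labeled = 'nh_metric{env="prod"} {count:24,sum:100,schema:0,zero_threshold:0.001,zero_count:4}'

# An unquoted '#' before the struct would mark an exemplar instead, in which case
# the function returns None and the regular _parse_sample path takes over.
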

def _parse_nh_struct(text):
    pattern = r'(\w+):\s*([^,}]+)'

    re_spans = re.compile(r'(positive_spans|negative_spans):\[(\d+:\d+(,\d+:\d+)*)\]')
    re_deltas = re.compile(r'(positive_deltas|negative_deltas):\[(-?\d+(?:,-?\d+)*)\]')

    items = dict(re.findall(pattern, text))
    span_matches = re_spans.findall(text)
    deltas = dict(re_deltas.findall(text))

    count_value = int(items['count'])
    sum_value = int(items['sum'])
    schema = int(items['schema'])
    zero_threshold = float(items['zero_threshold'])
    zero_count = int(items['zero_count'])

    pos_spans = _compose_spans(span_matches, 'positive_spans')
    neg_spans = _compose_spans(span_matches, 'negative_spans')
    pos_deltas = _compose_deltas(deltas, 'positive_deltas')
    neg_deltas = _compose_deltas(deltas, 'negative_deltas')

    return NativeHistogram(
        count_value=count_value,
        sum_value=sum_value,
        schema=schema,
        zero_threshold=zero_threshold,
        zero_count=zero_count,
        pos_spans=pos_spans,
        neg_spans=neg_spans,
        pos_deltas=pos_deltas,
        neg_deltas=neg_deltas
    )
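
The struct parsing above is driven entirely by the three regular expressions; a quick self-contained check of what they capture on a made-up native-histogram value string:

# --- illustrative sketch, not part of the commit ---
import re

text = ('{count:24,sum:100,schema:0,zero_threshold:0.001,zero_count:4,'
        'positive_spans:[0:2,1:2],positive_deltas:[2,1,-2,3]}')

items = dict(re.findall(r'(\w+):\s*([^,}]+)', text))
spans = re.findall(r'(positive_spans|negative_spans):\[(\d+:\d+(,\d+:\d+)*)\]', text)
deltas = dict(re.findall(r'(positive_deltas|negative_deltas):\[(-?\d+(?:,-?\d+)*)\]', text))

# The generic key:value pattern also picks up fragments of the list fields,
# but only the scalar keys are ever read.
print(items['count'], items['sum'], items['schema'])  # -> 24 100 0   (still strings)
print(spans)   # -> [('positive_spans', '0:2,1:2', ',1:2')]
print(deltas)  # -> {'positive_deltas': '2,1,-2,3'}
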
def _compose_spans(span_matches, spans_name):
    """Takes a list of span matches (expected to be a list of tuples) and a string
    (the expected span list name) and processes the list so that the values extracted
    from the span matches can be used to compose a tuple of BucketSpan objects"""
    spans = {}
    for match in span_matches:
        # Extract the key from the match (first element of the tuple).
        key = match[0]
        # Extract the value from the match (second element of the tuple).
        # Split the value string by commas to get individual pairs,
        # split each pair by ':' to get start and end, and convert them to integers.
        value = [tuple(map(int, pair.split(':'))) for pair in match[1].split(',')]
        # Store the processed value in the spans dictionary with the key.
        spans[key] = value
    if spans_name not in spans:
        return None
    out_spans = []
    # Iterate over each start and end tuple in the list of tuples for the specified spans_name.
    for start, end in spans[spans_name]:
        # Compose a BucketSpan object with the start and end values
        # and append it to the out_spans list.
        out_spans.append(BucketSpan(start, end))
    # Convert to tuple
    out_spans_tuple = tuple(out_spans)
    return out_spans_tuple
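
Tracing _compose_spans on the tuple shape that re.findall produces for the spans regex (values made up):

# --- illustrative sketch, not part of the commit ---
span_matches = [('positive_spans', '0:2,1:2', ',1:2')]

# _compose_spans(span_matches, 'positive_spans') would build
# (BucketSpan(0, 2), BucketSpan(1, 2)); the pair splitting itself is just:
pairs = [tuple(map(int, pair.split(':'))) for pair in span_matches[0][1].split(',')]
assert pairs == [(0, 2), (1, 2)]
# ...while _compose_spans(span_matches, 'negative_spans') would return None.
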
def _compose_deltas(deltas, deltas_name):
    """Takes a list of deltas matches (a dictionary) and a string (the expected delta list name),
    and processes its elements to compose a tuple of integers representing the deltas"""
    if deltas_name not in deltas:
        return None
    out_deltas = deltas.get(deltas_name)
    if out_deltas is not None and out_deltas.strip():
        elems = out_deltas.split(',')
        # Convert each element in the list elems to an integer
        # after stripping whitespace and create a tuple from these integers.
        out_deltas_tuple = tuple(int(x.strip()) for x in elems)
        return out_deltas_tuple
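
And the corresponding delta handling, again on made-up input:

# --- illustrative sketch, not part of the commit ---
deltas = {'positive_deltas': '2,1,-2,3'}

# _compose_deltas(deltas, 'positive_deltas') should give (2, 1, -2, 3);
# an absent key such as 'negative_deltas' yields None.
assert tuple(int(x.strip()) for x in deltas['positive_deltas'].split(',')) == (2, 1, -2, 3)
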
def _group_for_sample(sample, name, typ):
    if typ == 'info':
        # We can't distinguish between groups for info metrics.
@@ -406,6 +421,8 @@ def _check_histogram(samples, name):
    for s in samples:
        suffix = s.name[len(name):]
        g = _group_for_sample(s, name, 'histogram')
        if len(suffix) == 0:
            continue
        if g != group or s.timestamp != timestamp:
            if group is not None:
                do_checks()
@@ -481,11 +498,14 @@ def text_fd_to_metric_families(fd):
            raise ValueError("Units not allowed for this metric type: " + name)
        if typ in ['histogram', 'gaugehistogram']:
            _check_histogram(samples, name)
        _validate_metric_name(name)
        metric = Metric(name, documentation, typ, unit)
        # TODO: check labelvalues are valid utf8
        metric.samples = samples
        return metric

    is_nh = False
    typ = None
    for line in fd:
        if line[-1] == '\n':
            line = line[:-1]
@@ -499,16 +519,19 @@ def text_fd_to_metric_families(fd):
        if line == '# EOF':
            eof = True
        elif line.startswith('#'):
            parts = line.split(' ', 3)
            parts = _split_quoted(line, ' ', 3)
            if len(parts) < 4:
                raise ValueError("Invalid line: " + line)
            if parts[2] == name and samples:
            candidate_name, quoted = _unquote_unescape(parts[2])
            if not quoted and not _is_valid_legacy_metric_name(candidate_name):
                raise ValueError
            if candidate_name == name and samples:
                raise ValueError("Received metadata after samples: " + line)
            if parts[2] != name:
            if candidate_name != name:
                if name is not None:
                    yield build_metric(name, documentation, typ, unit, samples)
                # New metric
                name = parts[2]
                name = candidate_name
                unit = None
                typ = None
                documentation = None
@@ -517,8 +540,8 @@ def text_fd_to_metric_families(fd):
                group_timestamp = None
                group_timestamp_samples = set()
                samples = []
                allowed_names = [parts[2]]
                allowed_names = [candidate_name]

            if parts[1] == 'HELP':
                if documentation is not None:
                    raise ValueError("More than one HELP for metric: " + line)
@@ -537,12 +560,25 @@ def text_fd_to_metric_families(fd):
            else:
                raise ValueError("Invalid line: " + line)
        else:
            sample = _parse_sample(line)
            if sample.name not in allowed_names:
            if typ == 'histogram':
                # set to true to account for native histograms naming exceptions/sanitizing differences
                is_nh = True
                sample = _parse_nh_sample(line, tuple(type_suffixes['histogram']))
                # It's not a native histogram
                if sample is None:
                    is_nh = False
                    sample = _parse_sample(line)
            else:
                is_nh = False
                sample = _parse_sample(line)
            if sample.name not in allowed_names and not is_nh:
                if name is not None:
                    yield build_metric(name, documentation, typ, unit, samples)
                # Start an unknown metric.
                name = sample.name
                candidate_name, quoted = _unquote_unescape(sample.name)
                if not quoted and not _is_valid_legacy_metric_name(candidate_name):
                    raise ValueError
                name = candidate_name
                documentation = None
                unit = None
                typ = 'unknown'
@@ -570,26 +606,29 @@ def text_fd_to_metric_families(fd):
                    or _isUncanonicalNumber(sample.labels['quantile']))):
                raise ValueError("Invalid quantile label: " + line)

            g = tuple(sorted(_group_for_sample(sample, name, typ).items()))
            if group is not None and g != group and g in seen_groups:
                raise ValueError("Invalid metric grouping: " + line)
            if group is not None and g == group:
                if (sample.timestamp is None) != (group_timestamp is None):
                    raise ValueError("Mix of timestamp presence within a group: " + line)
                if group_timestamp is not None and group_timestamp > sample.timestamp and typ != 'info':
                    raise ValueError("Timestamps went backwards within a group: " + line)
            if not is_nh:
                g = tuple(sorted(_group_for_sample(sample, name, typ).items()))
                if group is not None and g != group and g in seen_groups:
                    raise ValueError("Invalid metric grouping: " + line)
                if group is not None and g == group:
                    if (sample.timestamp is None) != (group_timestamp is None):
                        raise ValueError("Mix of timestamp presence within a group: " + line)
                    if group_timestamp is not None and group_timestamp > sample.timestamp and typ != 'info':
                        raise ValueError("Timestamps went backwards within a group: " + line)
                else:
                    group_timestamp_samples = set()

                series_id = (sample.name, tuple(sorted(sample.labels.items())))
                if sample.timestamp != group_timestamp or series_id not in group_timestamp_samples:
                    # Not a duplicate due to timestamp truncation.
                    samples.append(sample)
                group_timestamp_samples.add(series_id)

                group = g
                group_timestamp = sample.timestamp
                seen_groups.add(g)
            else:
                group_timestamp_samples = set()

            series_id = (sample.name, tuple(sorted(sample.labels.items())))
            if sample.timestamp != group_timestamp or series_id not in group_timestamp_samples:
                # Not a duplicate due to timestamp truncation.
                samples.append(sample)
            group_timestamp_samples.add(series_id)

            group = g
            group_timestamp = sample.timestamp
            seen_groups.add(g)

            if typ == 'stateset' and sample.value not in [0, 1]:
                raise ValueError("Stateset samples can only have values zero and one: " + line)
@@ -606,7 +645,7 @@ def text_fd_to_metric_families(fd):
                    (typ in ['histogram', 'gaugehistogram'] and sample.name.endswith('_bucket'))
                    or (typ in ['counter'] and sample.name.endswith('_total'))):
                raise ValueError("Invalid line only histogram/gaugehistogram buckets and counters can have exemplars: " + line)

    if name is not None:
        yield build_metric(name, documentation, typ, unit, samples)
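
End to end, the public entry point is unchanged by this refactor. A small usage sketch, assuming a prometheus_client build that contains this change; the exposition text below is made up:

# --- illustrative sketch, not part of the commit ---
from prometheus_client.openmetrics.parser import text_string_to_metric_families

EXPOSITION = """\
# TYPE acme_http_requests counter
acme_http_requests_total{path="/",method="GET"} 21.0
# EOF
"""

for family in text_string_to_metric_families(EXPOSITION):
    for sample in family.samples:
        print(sample.name, sample.labels, sample.value)
# -> acme_http_requests_total {'path': '/', 'method': 'GET'} 21.0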