ryerraguntla commented on code in PR #3140:
URL: https://github.com/apache/iggy/pull/3140#discussion_r3232891046


##########
core/connectors/sources/influxdb_source/src/common.rs:
##########
@@ -0,0 +1,1060 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use iggy_common::serde_secret::serialize_secret;
+use iggy_common::{DateTime, Utc};
+use iggy_connector_sdk::{Error, Schema};
+use secrecy::SecretString;
+use serde::{Deserialize, Serialize};
+use std::sync::OnceLock;
+use tracing::warn;
+
+pub(crate) use crate::row::{Row, parse_csv_rows, parse_jsonl_rows};
+
+// ── Constants 
─────────────────────────────────────────────────────────────────
+
/// Default cursor column for V2 (Flux annotated-CSV timestamp annotation).
/// Applied when a V2 config omits `cursor_field`.
pub(crate) const DEFAULT_V2_CURSOR_FIELD: &str = "_time";
/// Default cursor column for V3 (SQL timestamp column name).
/// Applied when a V3 config omits `cursor_field`.
pub(crate) const DEFAULT_V3_CURSOR_FIELD: &str = "time";
+
+// ── Config 
────────────────────────────────────────────────────────────────────
+//
+// Uses `#[serde(tag = "version")]` instead of `#[serde(flatten)]` because
+// serde's flatten interacts poorly with tagged enums — the tag field can be
+// consumed before the variant content is parsed, causing deserialization to 
fail.
+
/// Version-tagged source configuration for the InfluxDB connector.
///
/// Serialized with an explicit `version` tag ("v2" / "v3"). Deserialization is
/// implemented manually (see the `Deserialize` impl below) so that configs
/// without a `version` key default to V2 for backward compatibility.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "version")]
pub enum InfluxDbSourceConfig {
    /// InfluxDB 2.x source (Flux queries, `_time` cursor column).
    #[serde(rename = "v2")]
    V2(V2SourceConfig),
    /// InfluxDB 3.x source (SQL queries, `time` cursor column).
    #[serde(rename = "v3")]
    V3(V3SourceConfig),
}
+
+/// Deserializes `InfluxDbSourceConfig` with backward-compatible version 
defaulting.
+///
+/// Existing V2 configs that omit the `version` field are treated as `"v2"` so
+/// deployments can upgrade without touching their config files. Explicitly
+/// unknown version strings are rejected with a clear error.
+impl<'de> serde::Deserialize<'de> for InfluxDbSourceConfig {
+    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, 
D::Error> {
+        let raw = serde_json::Value::deserialize(d)?;
+        let version = match raw.get("version") {
+            None => "v2", // absent key → backward compat default
+            Some(v) => v.as_str().ok_or_else(|| {
+                serde::de::Error::custom(format!(
+                    "\"version\" must be a string (e.g. \"v2\" or \"v3\"), 
got: {v}"
+                ))
+            })?,
+        };
+        match version {
+            "v2" => serde_json::from_value::<V2SourceConfig>(raw)
+                .map(Self::V2)
+                .map_err(serde::de::Error::custom),
+            "v3" => serde_json::from_value::<V3SourceConfig>(raw)
+                .map(Self::V3)
+                .map_err(serde::de::Error::custom),
+            other => Err(serde::de::Error::custom(format!(
+                "unknown InfluxDB version {other:?}; expected \"v2\" or \"v3\""
+            ))),
+        }
+    }
+}
+
/// Configuration for an InfluxDB 2.x (Flux) source.
///
/// `url`, `org`, `token`, and `query` are required; every other field is
/// optional, with defaults applied by the `InfluxDbSourceConfig` accessors.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct V2SourceConfig {
    // Server base URL; a trailing slash is tolerated (stripped by base_url()).
    pub(crate) url: String,
    // InfluxDB organization name.
    pub(crate) org: String,
    // API token; redacted on serialization via `serialize_secret`.
    #[serde(serialize_with = "serialize_secret")]
    pub(crate) token: SecretString,
    // Flux query template — presumably carries $cursor/$limit placeholders;
    // confirm against apply_query_params.
    pub(crate) query: String,
    pub(crate) poll_interval: Option<String>,
    // Rows per poll; defaults to 500, floored at 1 (see batch_size()).
    pub(crate) batch_size: Option<u32>,
    // Cursor column; defaults to "_time" for V2 (see cursor_field()).
    pub(crate) cursor_field: Option<String>,
    pub(crate) initial_offset: Option<String>,
    pub(crate) payload_column: Option<String>,
    pub(crate) payload_format: Option<String>,
    // Defaults to true (see include_metadata()).
    pub(crate) include_metadata: Option<bool>,
    pub(crate) verbose_logging: Option<bool>,
    // Defaults to 3, floored at 1 (see max_retries()).
    pub(crate) max_retries: Option<u32>,
    pub(crate) retry_delay: Option<String>,
    pub(crate) timeout: Option<String>,
    // Defaults to 10 (see max_open_retries()).
    pub(crate) max_open_retries: Option<u32>,
    pub(crate) open_retry_max_delay: Option<String>,
    pub(crate) retry_max_delay: Option<String>,
    // Defaults to 5 (see circuit_breaker_threshold()).
    pub(crate) circuit_breaker_threshold: Option<u32>,
    pub(crate) circuit_breaker_cool_down: Option<String>,
}
+
/// Configuration for an InfluxDB 3.x (SQL) source.
///
/// `url`, `db`, `token`, and `query` are required; every other field is
/// optional, with defaults applied by the `InfluxDbSourceConfig` accessors.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct V3SourceConfig {
    // Server base URL; a trailing slash is tolerated (stripped by base_url()).
    pub(crate) url: String,
    // Target database name (V3's analogue of V2's `org`).
    pub(crate) db: String,
    // API token; redacted on serialization via `serialize_secret`.
    #[serde(serialize_with = "serialize_secret")]
    pub(crate) token: SecretString,
    // SQL query template — presumably carries $cursor/$limit placeholders;
    // confirm against apply_query_params.
    pub(crate) query: String,
    pub(crate) poll_interval: Option<String>,
    // Rows per poll; defaults to 500, floored at 1 (see batch_size()).
    pub(crate) batch_size: Option<u32>,
    // Cursor column; defaults to "time" for V3 (see cursor_field()).
    pub(crate) cursor_field: Option<String>,
    pub(crate) initial_offset: Option<String>,
    pub(crate) payload_column: Option<String>,
    pub(crate) payload_format: Option<String>,
    /// When `false`, the cursor column (`time` by default) is excluded from the
    /// emitted JSON payload. Useful when consumers don't need the timestamp in
    /// the message body since it's available as message metadata.
    pub(crate) include_metadata: Option<bool>,
    pub(crate) verbose_logging: Option<bool>,
    // Defaults to 3, floored at 1 (see max_retries()).
    pub(crate) max_retries: Option<u32>,
    pub(crate) retry_delay: Option<String>,
    pub(crate) timeout: Option<String>,
    // Defaults to 10 (see max_open_retries()).
    pub(crate) max_open_retries: Option<u32>,
    pub(crate) open_retry_max_delay: Option<String>,
    pub(crate) retry_max_delay: Option<String>,
    // Defaults to 5 (see circuit_breaker_threshold()).
    pub(crate) circuit_breaker_threshold: Option<u32>,
    pub(crate) circuit_breaker_cool_down: Option<String>,
    /// Maximum factor by which batch_size may be inflated before the stuck-timestamp
    /// circuit breaker trips. Defaults to 10 (i.e. up to 10× the configured batch_size).
    /// Maximum accepted value is 100; higher values risk OOM-inducing queries.
    pub(crate) stuck_batch_cap_factor: Option<u32>,
}
+
// Eliminates the repetitive "match self { V2(c) => …, V3(c) => … }" pattern for
// fields that are identical across all config variants. Methods with
// version-specific logic (e.g. cursor_field, version_label) remain explicit.
//
// Supported patterns:
//   delegate!(ref  self.url)                 →  &Field (borrow)
//   delegate!(opt  self.poll_interval)       →  Option<&str> (via as_deref)
//   delegate!(unwrap self.batch_size, 500)   →  T: Copy with a fallback value
//
// The macro expands to a plain expression, so simple post-processing can be
// chained onto the invocation itself — e.g. batch_size() writes
// `delegate!(unwrap self.batch_size, 500).max(1)`.
//
// Not supported (use explicit match arms instead):
//   Fields with version-specific defaults (e.g. cursor_field: "_time" vs "time")
//   Fields that only exist on one variant (e.g. V3's stuck_batch_cap_factor)
macro_rules! delegate {
    // Shared field borrow  →  fn foo(&self) -> &T
    (ref $self:ident . $field:ident) => {
        match $self {
            Self::V2(c) => &c.$field,
            Self::V3(c) => &c.$field,
        }
    };
    // Option<String> field  →  Option<&str>
    (opt $self:ident . $field:ident) => {
        match $self {
            Self::V2(c) => c.$field.as_deref(),
            Self::V3(c) => c.$field.as_deref(),
        }
    };
    // Option<T: Copy> field  →  T, falling back to $default
    (unwrap $self:ident . $field:ident, $default:expr) => {
        match $self {
            Self::V2(c) => c.$field.unwrap_or($default),
            Self::V3(c) => c.$field.unwrap_or($default),
        }
    };
}
+
+impl InfluxDbSourceConfig {
+    pub fn url(&self) -> &str {
+        delegate!(ref    self.url)
+    }
+    pub fn token_secret(&self) -> &SecretString {
+        delegate!(ref    self.token)
+    }
+    pub fn poll_interval(&self) -> Option<&str> {
+        delegate!(opt    self.poll_interval)
+    }
+    pub fn batch_size(&self) -> u32 {
+        // Floor at 1 — callers build LIMIT $limit queries; LIMIT 0 stalls 
silently.
+        // open() also rejects 0 explicitly, but defense-in-depth here costs 
nothing.
+        delegate!(unwrap self.batch_size, 500).max(1)
+    }
+    pub fn initial_offset(&self) -> Option<&str> {
+        delegate!(opt    self.initial_offset)
+    }
+    pub fn payload_column(&self) -> Option<&str> {
+        delegate!(opt    self.payload_column)
+    }
+    pub fn payload_format(&self) -> Option<&str> {
+        delegate!(opt    self.payload_format)
+    }
+    pub fn verbose_logging(&self) -> bool {
+        delegate!(unwrap self.verbose_logging, false)
+    }
+    pub fn retry_delay(&self) -> Option<&str> {
+        delegate!(opt    self.retry_delay)
+    }
+    pub fn timeout(&self) -> Option<&str> {
+        delegate!(opt    self.timeout)
+    }
+    pub fn max_open_retries(&self) -> u32 {
+        delegate!(unwrap self.max_open_retries, 10)
+    }
+    pub fn open_retry_max_delay(&self) -> Option<&str> {
+        delegate!(opt  self.open_retry_max_delay)
+    }
+    pub fn retry_max_delay(&self) -> Option<&str> {
+        delegate!(opt    self.retry_max_delay)
+    }
+    pub fn circuit_breaker_threshold(&self) -> u32 {
+        delegate!(unwrap self.circuit_breaker_threshold, 5)
+    }
+    pub fn circuit_breaker_cool_down(&self) -> Option<&str> {
+        delegate!(opt self.circuit_breaker_cool_down)
+    }
+
+    // V2 and V3 use different default cursor column names.
+    pub fn cursor_field(&self) -> &str {
+        match self {
+            Self::V2(c) => 
c.cursor_field.as_deref().unwrap_or(DEFAULT_V2_CURSOR_FIELD),
+            Self::V3(c) => 
c.cursor_field.as_deref().unwrap_or(DEFAULT_V3_CURSOR_FIELD),
+        }
+    }
+
+    pub fn include_metadata(&self) -> bool {
+        delegate!(unwrap self.include_metadata, true)
+    }
+
+    // Both arms are identical; `delegate!` is not used because the `.max(1)` 
chain
+    // cannot be expressed in the macro without adding a new variant.
+    pub fn max_retries(&self) -> u32 {
+        match self {
+            Self::V2(c) => c.max_retries.unwrap_or(3).max(1),
+            Self::V3(c) => c.max_retries.unwrap_or(3).max(1),
+        }
+    }
+
+    pub fn version_label(&self) -> &'static str {
+        match self {
+            Self::V2(_) => "v2",
+            Self::V3(_) => "v3",
+        }
+    }
+
+    /// URL with any trailing slash stripped — used as the base for all 
endpoint URLs.
+    pub(crate) fn base_url(&self) -> &str {
+        self.url().trim_end_matches('/')
+    }
+}
+
+// ── Row processing context 
────────────────────────────────────────────────────
+
/// Per-poll fields that are constant across all rows in a batch.
/// Passed by reference to `process_rows` so the function signature stays at ≤ 3 parameters.
#[derive(Debug, Clone, Copy)]
pub(crate) struct RowContext<'a> {
    // Name of the timestamp column used as the incremental cursor.
    pub cursor_field: &'a str,
    // Cursor value this poll started from.
    pub current_cursor: &'a str,
    // When false, the cursor column is excluded from the emitted payload
    // (see V3SourceConfig::include_metadata).
    pub include_metadata: bool,
    // Optional column whose value — presumably — becomes the payload on its
    // own; confirm against process_rows.
    pub payload_col: Option<&'a str>,
    // Output encoding for the payload (JSON / text / raw).
    pub payload_format: PayloadFormat,
    // Poll timestamp in microseconds — NOTE(review): presumably Unix epoch
    // microseconds; confirm at the call site.
    pub now_micros: u64,
}
+
+// ── Persisted state 
───────────────────────────────────────────────────────────
+
/// Connector state persisted between runs, tagged with the InfluxDB version
/// ("v2" / "v3") it was produced for.
#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "version")]
pub enum PersistedState {
    /// State for an InfluxDB 2.x (Flux) source.
    #[serde(rename = "v2")]
    V2(V2State),
    /// State for an InfluxDB 3.x (SQL) source.
    #[serde(rename = "v3")]
    V3(V3State),
}
+
/// Persisted cursor state for a V2 (Flux) source.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct V2State {
    // Cursor timestamp of the last delivered row; starts as `None` (Default).
    pub last_timestamp: Option<String>,
    // Running count of rows processed so far.
    pub processed_rows: u64,
    /// Rows at `last_timestamp` already delivered; used to skip them when the
    /// Flux query uses `>= $cursor` and a batch boundary lands mid-timestamp.
    pub cursor_row_count: u64,
}
+
/// Persisted cursor state for a V3 (SQL) source.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct V3State {
    // Cursor timestamp of the last delivered row; starts as `None` (Default).
    pub last_timestamp: Option<String>,
    // Running count of rows processed so far.
    pub processed_rows: u64,
    /// Current effective batch size after stuck-timestamp inflation.
    /// Reset to the configured base value when the cursor advances.
    pub effective_batch_size: u32,
    /// Row offset within the last timestamp group — used as a tiebreaker
    /// so that siblings at the same timestamp are not silently dropped.
    pub last_timestamp_row_offset: u64,
}
+
+// ── Payload format 
────────────────────────────────────────────────────────────
+
/// Output encoding for emitted messages, parsed from the optional
/// `payload_format` config value (see `from_config`); defaults to JSON.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum PayloadFormat {
    /// JSON document — config value "json"; also the fallback for `None` or
    /// unrecognized values.
    #[default]
    Json,
    /// UTF-8 text — config values "text" / "utf8".
    Text,
    /// Raw bytes — config values "raw" / "base64".
    Raw,
}
+
+impl PayloadFormat {
+    pub fn from_config(value: Option<&str>) -> Self {
+        match value.map(|v| v.to_ascii_lowercase()).as_deref() {
+            Some("text") | Some("utf8") => PayloadFormat::Text,
+            Some("raw") | Some("base64") => PayloadFormat::Raw,
+            Some("json") => PayloadFormat::Json,
+            other => {
+                if other.is_some() {
+                    warn!(
+                        "Unrecognized payload_format {:?}, falling back to 
JSON",
+                        other
+                    );
+                }
+                PayloadFormat::Json
+            }
+        }
+    }
+
+    pub fn schema(self) -> Schema {
+        match self {
+            PayloadFormat::Json => Schema::Json,
+            PayloadFormat::Text => Schema::Text,
+            PayloadFormat::Raw => Schema::Raw,
+        }
+    }
+}
+
+// ── Cursor validation 
─────────────────────────────────────────────────────────
+
// Process-wide compiled cursor-validation regex; built lazily on first use.
static CURSOR_RE: OnceLock<regex::Regex> = OnceLock::new();

/// Return the shared cursor-validation regex, compiling it on first call.
pub fn cursor_re() -> &'static regex::Regex {
    CURSOR_RE.get_or_init(|| {
        // Validates RFC 3339 timestamp structure with proper field ranges:
        // month 01-12, day 01-31, hour 00-23, minute/second 00-59.
        // Timezone suffix is required: a naive timestamp without Z or +HH:MM
        // is rejected to prevent silent UTC-vs-local ambiguity between V2 (Flux
        // always treats timestamps as UTC) and V3 (SQL engine timezone depends
        // on server config).
        // Note: day 29-31 validity for a given month is not checked by the regex;
        // chrono parsing inside validate_cursor handles that for tz-aware timestamps.
        // (?-u) disables Unicode mode so \d matches ASCII digits only.
        regex::Regex::new(
            r"(?-u)^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T([01]\d|2[0-3]):[0-5]\d:[0-5]\d(\.\d+)?(Z|[+-]\d{2}:\d{2})$"
        )
        .expect("hardcoded regex is valid")
    })
}
+
+pub fn validate_cursor(cursor: &str) -> Result<(), Error> {
+    if !cursor_re().is_match(cursor) {
+        return Err(Error::InvalidConfigValue(format!(
+            "cursor value {cursor:?} is not a valid RFC 3339 timestamp"
+        )));
+    }
+    // Chain chrono parse to catch calendar-invalid dates (e.g. Feb 30)
+    chrono::DateTime::parse_from_rfc3339(cursor).map_err(|e| {
+        Error::InvalidConfigValue(format!(
+            "cursor value {cursor:?} failed chrono validation: {e}"
+        ))
+    })?;
+    Ok(())
+}
+
+/// Validate `cursor_field` for the given connector version.
+///
+/// `version` should be `"v2"` or `"v3"`. The function is version-strict: 
`"_time"`
+/// is only valid for V2 (Flux annotation column) and `"time"` is only valid 
for V3
+/// (SQL timestamp column). Swapping them silently would produce empty result 
sets
+/// or query errors at the InfluxDB level.
+pub fn validate_cursor_field(field: &str, version: &str) -> Result<(), Error> {
+    if field.is_empty() {
+        return Err(Error::InvalidConfigValue(format!(
+            "cursor_field must not be empty for {version} — \
+             use \"_time\" for v2 or \"time\" for v3"
+        )));
+    }
+    match (field, version) {
+        ("time", "v2") => Err(Error::InvalidConfigValue(
+            "cursor_field \"time\" is not valid for v2 — use \"_time\" \
+             (the Flux annotated-CSV timestamp column)"
+                .into(),
+        )),
+        ("_time", "v3") => Err(Error::InvalidConfigValue(
+            "cursor_field \"_time\" is not valid for v3 — use \"time\" \
+             (the SQL timestamp column)"
+                .into(),
+        )),
+        // Allow everything else — custom column names are valid
+        _ => Ok(()),
+    }
+}
+
+// ── Timestamp helpers 
─────────────────────────────────────────────────────────
+
+/// Return `true` if timestamp string `a` is strictly after the pre-parsed `b`.
+///
+/// `b` is accepted as an already-parsed `DateTime<Utc>` so callers that 
compare
+/// against the same cursor on every row in a batch parse it once, not O(n) 
times.
+/// `a` is parsed on each call. Returns `false` conservatively when `a` fails 
to
+/// parse — do NOT advance the cursor when comparison is ambiguous. 
Lexicographic
+/// comparison is incorrect for timestamps with different timezone offsets
+/// (e.g. `+05:30` vs `Z`) and would silently produce wrong cursor advancement.
+pub fn is_timestamp_after(a: &str, b_parsed: DateTime<Utc>) -> bool {
+    match a.parse::<DateTime<Utc>>() {
+        Ok(dt_a) => dt_a > b_parsed,
+        Err(_) => {
+            warn!(
+                "is_timestamp_after: could not parse {a:?} as RFC 3339; \
+                 refusing to advance cursor"
+            );
+            false
+        }
+    }
+}
+
+/// Return `true` if timestamps `a` and `b` represent the same instant,
+/// regardless of timezone format differences.
+///
+/// Raw string equality is wrong here: `"2024-01-01T00:00:00Z"` and
+/// `"2024-01-01T00:00:00+00:00"` are the same instant but differ lexically.
+/// This causes `all_at_cursor` to flip `false` incorrectly for one poll round,
+/// producing duplicate delivery that self-heals next poll once the cursor
+/// string is overwritten.
+///
+/// Falls back to string equality if either value fails to parse — 
conservative,
+/// avoids a false "not equal" that would produce unnecessary duplicates.
+pub(crate) fn timestamps_equal(a: &str, b: &str) -> bool {
+    match (a.parse::<DateTime<Utc>>(), b.parse::<DateTime<Utc>>()) {
+        (Ok(dt_a), Ok(dt_b)) => dt_a == dt_b,
+        _ => {
+            warn!(
+                "timestamps_equal: could not parse timestamps as RFC 3339 \
+                 ({a:?} vs {b:?}); falling back to string equality"
+            );
+            a == b
+        }
+    }
+}
+// ── Scalar parsing 
────────────────────────────────────────────────────────────
+
+/// Parse a string value from InfluxDB into the most specific JSON scalar type.
+///
+/// Tries `bool`, then `i64`, then `f64`; falls back to `String`. An empty
+/// string becomes `null`. `NaN` and `±Infinity` are emitted as strings because
+/// JSON has no representation for non-finite floats
+/// (`serde_json::Number::from_f64` returns `None` for them).
+pub fn parse_scalar(value: &str) -> serde_json::Value {
+    if value.is_empty() {
+        return serde_json::Value::Null;
+    }
+    if let Ok(v) = value.parse::<bool>() {
+        return serde_json::Value::Bool(v);
+    }
+    if let Ok(v) = value.parse::<i64>() {
+        return serde_json::Value::Number(v.into());
+    }
+    if let Ok(v) = value.parse::<f64>()
+        && let Some(number) = serde_json::Number::from_f64(v)
+    {
+        return serde_json::Value::Number(number);
+    }
+    serde_json::Value::String(value.to_string())
+}
+
+// ── Query template substitution 
───────────────────────────────────────────────
+
+/// Substitute `$cursor` and `$limit` placeholders in a query template in a
+/// single pass, avoiding the two intermediate `String` allocations that
+/// `clone() + replace() + replace()` would produce.
+pub(crate) fn apply_query_params(

Review Comment:
   Good one. Resolved.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to