ryerraguntla commented on code in PR #3140:
URL: https://github.com/apache/iggy/pull/3140#discussion_r3142607236


##########
core/connectors/sources/influxdb_source/src/common.rs:
##########
@@ -0,0 +1,826 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use iggy_common::serde_secret::serialize_secret;
+use iggy_common::{DateTime, Utc};
+use iggy_connector_sdk::{Error, Schema};
+use secrecy::SecretString;
+use serde::{Deserialize, Serialize};
+use std::sync::OnceLock;
+use tracing::warn;
+
+pub(crate) use crate::row::{Row, parse_csv_rows, parse_jsonl_rows};
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+/// Default cursor column for V2 (Flux annotated-CSV timestamp annotation).
+pub(crate) const DEFAULT_V2_CURSOR_FIELD: &str = "_time";
+/// Default cursor column for V3 (SQL timestamp column name).
+pub(crate) const DEFAULT_V3_CURSOR_FIELD: &str = "time";
+
+// ── Config ────────────────────────────────────────────────────────────────────
+//
+// Uses `#[serde(tag = "version")]` instead of `#[serde(flatten)]` because
+// serde's flatten interacts poorly with tagged enums — the tag field can be
+// consumed before the variant content is parsed, causing deserialization to fail.
+
+/// Connector configuration, with one variant per supported config version.
+///
+/// Serialized as an internally tagged enum: the selected variant's fields are
+/// written next to a `version` key holding `"v2"` or `"v3"`. Only `Serialize`
+/// is derived; `Deserialize` is implemented by hand for this type.
+#[derive(Debug, Clone, Serialize)]
+#[serde(tag = "version")]
+pub enum InfluxDbSourceConfig {
+    /// V2 settings, tagged as `version = "v2"`.
+    #[serde(rename = "v2")]
+    V2(V2SourceConfig),
+    /// V3 settings, tagged as `version = "v3"`.
+    #[serde(rename = "v3")]
+    V3(V3SourceConfig),
+}
+
+/// Deserializes `InfluxDbSourceConfig` with backward-compatible version defaulting.
+///
+/// Existing V2 configs that omit the `version` field (or set it to `null`) are
+/// treated as `"v2"` so deployments can upgrade without touching their config
+/// files. Unknown version strings are rejected with a clear error, and so is a
+/// `version` value that is not a string at all (for example the number `3`):
+/// silently parsing such a config as V2 would hide the typo behind an
+/// unrelated "missing field" error.
+impl<'de> serde::Deserialize<'de> for InfluxDbSourceConfig {
+    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+        // Buffer the whole document so the tag can be inspected before the
+        // same value is parsed into the matching concrete struct.
+        let raw = serde_json::Value::deserialize(d)?;
+        let version = match raw.get("version") {
+            // Absent (or explicit null) tag: legacy V2 config.
+            None | Some(serde_json::Value::Null) => "v2",
+            Some(serde_json::Value::String(tag)) => tag.as_str(),
+            // Present but not a string: refuse rather than guess.
+            Some(other) => {
+                return Err(serde::de::Error::custom(format!(
+                    "InfluxDB version must be a string, got {other}; expected \"v2\" or \"v3\""
+                )));
+            }
+        };
+        match version {
+            "v2" => serde_json::from_value::<V2SourceConfig>(raw)
+                .map(Self::V2)
+                .map_err(serde::de::Error::custom),
+            "v3" => serde_json::from_value::<V3SourceConfig>(raw)
+                .map(Self::V3)
+                .map_err(serde::de::Error::custom),
+            other => Err(serde::de::Error::custom(format!(
+                "unknown InfluxDB version {other:?}; expected \"v2\" or \"v3\""
+            ))),
+        }
+    }
+}
+
+/// Settings carried by the `V2` variant of `InfluxDbSourceConfig`.
+///
+/// Every `Option` field may be omitted. The defaults quoted below are the ones
+/// applied by the accessor methods on `InfluxDbSourceConfig`, not by serde.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct V2SourceConfig {
+    /// Server URL; trailing `/` characters are stripped by `base_url()`.
+    pub(crate) url: String,
+    // NOTE(review): presumably the InfluxDB organization name — confirm.
+    pub(crate) org: String,
+    /// API token; serialized through the `serialize_secret` helper.
+    #[serde(serialize_with = "serialize_secret")]
+    pub(crate) token: SecretString,
+    // NOTE(review): presumably the Flux query text — confirm against the V2 poller.
+    pub(crate) query: String,
+    // Interval/delay/timeout fields are kept as raw strings here; their
+    // accepted syntax is decided wherever they are parsed (not in this struct).
+    pub(crate) poll_interval: Option<String>,
+    /// Accessor default: 500.
+    pub(crate) batch_size: Option<u32>,
+    /// Accessor default: `DEFAULT_V2_CURSOR_FIELD` (`"_time"`).
+    pub(crate) cursor_field: Option<String>,
+    pub(crate) initial_offset: Option<String>,
+    pub(crate) payload_column: Option<String>,
+    pub(crate) payload_format: Option<String>,
+    /// Accessor default: `true`.
+    pub(crate) include_metadata: Option<bool>,
+    /// Accessor default: `false`.
+    pub(crate) verbose_logging: Option<bool>,
+    /// Accessor default: 3; the accessor also raises any value below 1 to 1.
+    pub(crate) max_retries: Option<u32>,
+    pub(crate) retry_delay: Option<String>,
+    pub(crate) timeout: Option<String>,
+    /// Accessor default: 10.
+    pub(crate) max_open_retries: Option<u32>,
+    pub(crate) open_retry_max_delay: Option<String>,
+    pub(crate) retry_max_delay: Option<String>,
+    /// Accessor default: 5.
+    pub(crate) circuit_breaker_threshold: Option<u32>,
+    pub(crate) circuit_breaker_cool_down: Option<String>,
+}
+
+/// Settings carried by the `V3` variant of `InfluxDbSourceConfig`.
+///
+/// Every `Option` field may be omitted. The defaults quoted below are the ones
+/// applied by the accessor methods on `InfluxDbSourceConfig`, not by serde.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct V3SourceConfig {
+    /// Server URL; trailing `/` characters are stripped by `base_url()`.
+    pub(crate) url: String,
+    // NOTE(review): presumably the database name queried — confirm.
+    pub(crate) db: String,
+    /// API token; serialized through the `serialize_secret` helper.
+    #[serde(serialize_with = "serialize_secret")]
+    pub(crate) token: SecretString,
+    // NOTE(review): presumably the SQL query text — confirm against the V3 poller.
+    pub(crate) query: String,
+    // Interval/delay/timeout fields are kept as raw strings here; their
+    // accepted syntax is decided wherever they are parsed (not in this struct).
+    pub(crate) poll_interval: Option<String>,
+    /// Accessor default: 500.
+    pub(crate) batch_size: Option<u32>,
+    /// Accessor default: `DEFAULT_V3_CURSOR_FIELD` (`"time"`).
+    pub(crate) cursor_field: Option<String>,
+    pub(crate) initial_offset: Option<String>,
+    pub(crate) payload_column: Option<String>,
+    pub(crate) payload_format: Option<String>,
+    /// When `false`, the cursor column (`time` by default) is excluded from the
+    /// emitted JSON payload. Useful when consumers don't need the timestamp in
+    /// the message body since it's available as message metadata.
+    /// Accessor default: `true`.
+    pub(crate) include_metadata: Option<bool>,
+    /// Accessor default: `false`.
+    pub(crate) verbose_logging: Option<bool>,
+    /// Accessor default: 3; the accessor also raises any value below 1 to 1.
+    pub(crate) max_retries: Option<u32>,
+    pub(crate) retry_delay: Option<String>,
+    pub(crate) timeout: Option<String>,
+    /// Accessor default: 10.
+    pub(crate) max_open_retries: Option<u32>,
+    pub(crate) open_retry_max_delay: Option<String>,
+    pub(crate) retry_max_delay: Option<String>,
+    /// Accessor default: 5.
+    pub(crate) circuit_breaker_threshold: Option<u32>,
+    pub(crate) circuit_breaker_cool_down: Option<String>,
+    /// Maximum factor by which batch_size may be inflated before the stuck-timestamp
+    /// circuit breaker trips. Defaults to 10 (i.e. up to 10× the configured batch_size).
+    /// Maximum accepted value is 100; higher values risk OOM-inducing queries.
+    pub(crate) stuck_batch_cap_factor: Option<u32>,
+}
+
+// Eliminates the repetitive "match self { V2(c) => …, V3(c) => … }" pattern for
+// fields that are identical across all config variants. Methods with
+// version-specific logic (cursor_field, max_retries, version_label) remain explicit.
+//
+// Each arm expands to a `match` on `Self::V2` / `Self::V3`, so the macro is
+// only usable inside `impl InfluxDbSourceConfig` with `self` as the receiver.
+//
+// Supported patterns:
+//   delegate!(ref  self.url)                        →  &String (borrow)
+//   delegate!(opt  self.poll_interval)              →  Option<&str>
+//   delegate!(unwrap self.batch_size, 500)          →  T: Copy with value fallback
+//
+// Not supported (use explicit match arms instead):
+//   Fields with version-specific defaults (e.g. cursor_field: "_time" vs "time")
+//   Fields with chained transformations (e.g. max_retries + .max(1))
+//   Fields that only exist on one variant (e.g. V3's stuck_batch_cap_factor)
+macro_rules! delegate {
+    // &T field reference  →  fn foo(&self) -> &T
+    (ref $self:ident . $field:ident) => {
+        match $self {
+            Self::V2(c) => &c.$field,
+            Self::V3(c) => &c.$field,
+        }
+    };
+    // Option<String>  →  Option<&str>
+    (opt $self:ident . $field:ident) => {
+        match $self {
+            Self::V2(c) => c.$field.as_deref(),
+            Self::V3(c) => c.$field.as_deref(),
+        }
+    };
+    // Option<T: Copy>  →  T with fallback
+    // `$default` is spliced into both arms, but only the matching arm runs.
+    (unwrap $self:ident . $field:ident, $default:expr) => {
+        match $self {
+            Self::V2(c) => c.$field.unwrap_or($default),
+            Self::V3(c) => c.$field.unwrap_or($default),
+        }
+    };
+}
+
+impl InfluxDbSourceConfig {
+    /// Server URL exactly as configured (may still end with `/`).
+    pub fn url(&self) -> &str {
+        delegate!(ref self.url)
+    }
+
+    /// API token, still wrapped in its `SecretString`.
+    pub fn token_secret(&self) -> &SecretString {
+        delegate!(ref self.token)
+    }
+
+    /// Raw `poll_interval` string, if one was configured.
+    pub fn poll_interval(&self) -> Option<&str> {
+        delegate!(opt self.poll_interval)
+    }
+
+    /// Configured batch size, or 500 when unset.
+    pub fn batch_size(&self) -> u32 {
+        delegate!(unwrap self.batch_size, 500)
+    }
+
+    /// Raw `initial_offset` string, if one was configured.
+    pub fn initial_offset(&self) -> Option<&str> {
+        delegate!(opt self.initial_offset)
+    }
+
+    /// Raw `payload_column` string, if one was configured.
+    pub fn payload_column(&self) -> Option<&str> {
+        delegate!(opt self.payload_column)
+    }
+
+    /// Raw `payload_format` string, if one was configured.
+    pub fn payload_format(&self) -> Option<&str> {
+        delegate!(opt self.payload_format)
+    }
+
+    /// Verbose-logging flag, `false` when unset.
+    pub fn verbose_logging(&self) -> bool {
+        delegate!(unwrap self.verbose_logging, false)
+    }
+
+    /// Raw `retry_delay` string, if one was configured.
+    pub fn retry_delay(&self) -> Option<&str> {
+        delegate!(opt self.retry_delay)
+    }
+
+    /// Raw `timeout` string, if one was configured.
+    pub fn timeout(&self) -> Option<&str> {
+        delegate!(opt self.timeout)
+    }
+
+    /// Configured `max_open_retries`, or 10 when unset.
+    pub fn max_open_retries(&self) -> u32 {
+        delegate!(unwrap self.max_open_retries, 10)
+    }
+
+    /// Raw `open_retry_max_delay` string, if one was configured.
+    pub fn open_retry_max_delay(&self) -> Option<&str> {
+        delegate!(opt self.open_retry_max_delay)
+    }
+
+    /// Raw `retry_max_delay` string, if one was configured.
+    pub fn retry_max_delay(&self) -> Option<&str> {
+        delegate!(opt self.retry_max_delay)
+    }
+
+    /// Configured `circuit_breaker_threshold`, or 5 when unset.
+    pub fn circuit_breaker_threshold(&self) -> u32 {
+        delegate!(unwrap self.circuit_breaker_threshold, 5)
+    }
+
+    /// Raw `circuit_breaker_cool_down` string, if one was configured.
+    pub fn circuit_breaker_cool_down(&self) -> Option<&str> {
+        delegate!(opt self.circuit_breaker_cool_down)
+    }
+
+    /// Cursor column name; the fallback differs per version
+    /// (`DEFAULT_V2_CURSOR_FIELD` for V2, `DEFAULT_V3_CURSOR_FIELD` for V3).
+    pub fn cursor_field(&self) -> &str {
+        let (configured, fallback) = match self {
+            Self::V2(c) => (c.cursor_field.as_deref(), DEFAULT_V2_CURSOR_FIELD),
+            Self::V3(c) => (c.cursor_field.as_deref(), DEFAULT_V3_CURSOR_FIELD),
+        };
+        configured.unwrap_or(fallback)
+    }
+
+    /// Metadata flag, `true` when unset.
+    pub fn include_metadata(&self) -> bool {
+        delegate!(unwrap self.include_metadata, true)
+    }
+
+    /// Configured `max_retries` (3 when unset), never less than 1.
+    ///
+    /// Written out by hand because the clamp is applied after the fallback,
+    /// which the plain `delegate!(unwrap …)` form does not cover.
+    pub fn max_retries(&self) -> u32 {
+        let configured = match self {
+            Self::V2(c) => c.max_retries,
+            Self::V3(c) => c.max_retries,
+        };
+        configured.unwrap_or(3).max(1)
+    }
+
+    /// Short label for the active variant: `"v2"` or `"v3"`.
+    pub fn version_label(&self) -> &'static str {
+        match self {
+            Self::V2(_) => "v2",
+            Self::V3(_) => "v3",
+        }
+    }
+
+    /// URL with any trailing slash stripped — used as the base for all endpoint URLs.
+    pub(crate) fn base_url(&self) -> &str {
+        let configured = self.url();
+        configured.trim_end_matches('/')
+    }
+}
+
+// ── Row processing context ────────────────────────────────────────────────────
+
+/// Per-poll fields that are constant across all rows in a batch.
+/// Passed by reference to `process_rows` so the function signature stays at ≤ 3 parameters.
+///
+/// The struct is `Copy`: it only holds borrowed strings and small scalars.
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct RowContext<'a> {
+    // NOTE(review): presumably the resolved cursor column name — confirm at the call site.
+    pub cursor_field: &'a str,
+    // NOTE(review): presumably the cursor value this poll started from — confirm.
+    pub current_cursor: &'a str,
+    pub include_metadata: bool,
+    /// Optional payload column name; `None` when no column was configured.
+    pub payload_col: Option<&'a str>,
+    pub payload_format: PayloadFormat,
+    // NOTE(review): name suggests a timestamp in microseconds captured once per poll — confirm.
+    pub now_micros: u64,
+}
+
+// ── Persisted state ───────────────────────────────────────────────────────────
+
+/// Connector state in its serialized form, one variant per config version.
+///
+/// Internally tagged like the config enum: the variant's fields are stored
+/// next to a `version` key holding `"v2"` or `"v3"`.
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(tag = "version")]
+pub enum PersistedState {
+    /// State for the V2 connector, tagged as `version = "v2"`.
+    #[serde(rename = "v2")]
+    V2(V2State),
+    /// State for the V3 connector, tagged as `version = "v3"`.
+    #[serde(rename = "v3")]
+    V3(V3State),
+}
+
+/// Progress record for the V2 connector. `Default` yields no timestamp and zero counters.
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct V2State {
+    // NOTE(review): presumably the cursor value of the most recently delivered row — confirm.
+    pub last_timestamp: Option<String>,
+    // NOTE(review): presumably a running total of delivered rows — confirm.
+    pub processed_rows: u64,
+    /// Rows at `last_timestamp` already delivered; used to skip them when the
+    /// Flux query uses `>= $cursor` and a batch boundary lands mid-timestamp.
+    pub cursor_row_count: u64,
+}
+
+/// Progress record for the V3 connector. `Default` yields no timestamp and zero counters.
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct V3State {
+    // NOTE(review): presumably the cursor value of the most recently delivered row — confirm.
+    pub last_timestamp: Option<String>,
+    // NOTE(review): presumably a running total of delivered rows — confirm.
+    pub processed_rows: u64,
+    /// Current effective batch size after stuck-timestamp inflation.
+    /// Reset to the configured base value when the cursor advances.
+    // NOTE(review): `Default` makes this 0 — confirm callers replace it with
+    // the configured batch size before the first query.
+    pub effective_batch_size: u32,
+}
+
+// ── Payload format ────────────────────────────────────────────────────────────
+
+/// Output format selected by the `payload_format` config string.
+///
+/// `Json` is the `Default` and also what `from_config` falls back to for a
+/// missing or unrecognized value.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum PayloadFormat {
+    /// Selected by `"json"`; maps to `Schema::Json`.
+    #[default]
+    Json,
+    /// Selected by `"text"` or `"utf8"`; maps to `Schema::Text`.
+    Text,
+    /// Selected by `"raw"` or `"base64"`; maps to `Schema::Raw`.
+    Raw,
+}
+
+impl PayloadFormat {
+    /// Maps the optional `payload_format` config string to a variant.
+    ///
+    /// Matching is ASCII case-insensitive. `"text"`/`"utf8"` select `Text`,
+    /// `"raw"`/`"base64"` select `Raw`, and `"json"` or no value select `Json`.
+    /// Any other value logs a warning and falls back to `Json` as well.
+    pub fn from_config(value: Option<&str>) -> Self {
+        let normalized = value.map(str::to_ascii_lowercase);
+        match normalized.as_deref() {
+            Some("text" | "utf8") => Self::Text,
+            Some("raw" | "base64") => Self::Raw,
+            Some("json") | None => Self::Json,
+            unrecognized @ Some(_) => {
+                warn!(
+                    "Unrecognized payload_format {:?}, falling back to JSON",
+                    unrecognized
+                );
+                Self::Json
+            }
+        }
+    }
+
+    /// Returns the SDK `Schema` variant of the same name.
+    pub fn schema(self) -> Schema {
+        match self {
+            Self::Json => Schema::Json,
+            Self::Text => Schema::Text,
+            Self::Raw => Schema::Raw,
+        }
+    }
+}
+
+// ── Cursor validation ─────────────────────────────────────────────────────────
+
+/// Holds the compiled cursor-validation regex once `cursor_re` has been called.
+static CURSOR_RE: OnceLock<regex::Regex> = OnceLock::new();
+
+/// Returns the shared cursor-validation regex, compiling it on first use.
+///
+/// # Panics
+///
+/// Panics if the hard-coded pattern fails to compile.
+pub fn cursor_re() -> &'static regex::Regex {
+    CURSOR_RE.get_or_init(|| {
+        // Validates RFC 3339 timestamp structure with proper field ranges:
+        // month 01-12, day 01-31, hour 00-23, minute/second 00-59.
+        // Timezone suffix is required: a naive timestamp without Z or +HH:MM
+        // is rejected to prevent silent UTC-vs-local ambiguity between V2 (Flux
+        // always treats timestamps as UTC) and V3 (SQL engine timezone depends
+        // on server config).
+        // Note: day 29-31 validity for a given month is not checked by the regex;
+        // chrono parsing inside validate_cursor handles that for tz-aware timestamps.
+        // NOTE(review): `validate_cursor` as written in this file only applies
+        // this regex — confirm where (or whether) the chrono check happens.
+        regex::Regex::new(
+            r"^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T([01]\d|2[0-3]):[0-5]\d:[0-5]\d(\.\d+)?(Z|[+-]\d{2}:\d{2})$"
+        )
+        .expect("hardcoded regex is valid")
+    })
+}
+
+/// Checks that `cursor` matches the pattern returned by `cursor_re()`.
+///
+/// # Errors
+///
+/// Returns `Error::InvalidConfigValue` when the value does not match, so it
+/// is never substituted into a query.
+pub fn validate_cursor(cursor: &str) -> Result<(), Error> {
+    if !cursor_re().is_match(cursor) {
+        return Err(Error::InvalidConfigValue(format!(
+            "cursor value {cursor:?} is not a valid RFC 3339 timestamp; \
+             refusing substitution to prevent query injection"
+        )));
+    }
+    Ok(())
+}
+
+/// Validate `cursor_field` for the given connector version.
+///
+/// `version` should be `"v2"` or `"v3"`. The function is version-strict: 
`"_time"`
+/// is only valid for V2 (Flux annotation column) and `"time"` is only valid 
for V3
+/// (SQL timestamp column). Swapping them silently would produce empty result 
sets
+/// or query errors at the InfluxDB level.
+pub fn validate_cursor_field(field: &str, version: &str) -> Result<(), Error> {

Review Comment:
   fixed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to