ryerraguntla commented on code in PR #2933:
URL: https://github.com/apache/iggy/pull/2933#discussion_r3009383845


##########
core/connectors/sources/influxdb_source/src/lib.rs:
##########
@@ -0,0 +1,1241 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use async_trait::async_trait;
+use base64::{Engine as _, engine::general_purpose};
+use csv::StringRecord;
+use iggy_common::serde_secret::serialize_secret;
+use iggy_common::{DateTime, Utc};
+use iggy_connector_sdk::retry::{
+    CircuitBreaker, ConnectivityConfig, build_retry_client, 
check_connectivity_with_retry,
+    parse_duration,
+};
+use iggy_connector_sdk::{
+    ConnectorState, Error, ProducedMessage, ProducedMessages, Schema, Source, 
source_connector,
+};
+use regex::Regex;
+use reqwest::Url;
+use reqwest_middleware::ClientWithMiddleware;
+use secrecy::{ExposeSecret, SecretString};
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::sync::OnceLock;
+use std::time::Duration;
+use tokio::sync::Mutex;
+use tracing::{debug, error, info, warn};
+use uuid::Uuid;
+
+source_connector!(InfluxDbSource);
+
+const CONNECTOR_NAME: &str = "InfluxDB source";
+const DEFAULT_MAX_RETRIES: u32 = 3; // NOTE(review): presumably the fallback for `max_retries` — confirm at the use site
+const DEFAULT_RETRY_DELAY: &str = "1s"; // fallback handed to `parse_duration` when `retry_delay` is unset
+const DEFAULT_POLL_INTERVAL: &str = "5s"; // fallback handed to `parse_duration` when `poll_interval` is unset
+const DEFAULT_TIMEOUT: &str = "10s"; // NOTE(review): presumably the fallback for `timeout` — confirm at the use site
+const DEFAULT_CURSOR: &str = "1970-01-01T00:00:00Z"; // Unix epoch; NOTE(review): presumably the starting cursor when none is configured — confirm
+// Maximum attempts for open() connectivity retries
+const DEFAULT_MAX_OPEN_RETRIES: u32 = 10;
+// Cap for exponential backoff in open() — never wait longer than this
+const DEFAULT_OPEN_RETRY_MAX_DELAY: &str = "60s";
+// Cap for exponential backoff on per-query retries — kept short so a
+// transient InfluxDB blip does not stall polling for too long
+const DEFAULT_RETRY_MAX_DELAY: &str = "5s";
+// How many consecutive poll failures open the circuit breaker
+const DEFAULT_CIRCUIT_BREAKER_THRESHOLD: u32 = 5;
+// How long the circuit stays open before allowing a probe attempt
+const DEFAULT_CIRCUIT_COOL_DOWN: &str = "30s";
+
+/// RFC 3339 / ISO 8601 datetime pattern.
+/// Matches the forms InfluxDB stores in `_time`:
+///   "2024-01-15T10:30:00Z"
+///   "2024-01-15T10:30:00.123456789Z"
+///   "2024-01-15T10:30:00+05:30"
+/// Intentionally strict: only digits, T, Z, colon, dot, plus, hyphen.
+/// Any Flux syntax character (pipe, quote, paren, space, slash) is rejected.
+static CURSOR_RE: OnceLock<Regex> = OnceLock::new();
+
+// ---------------------------------------------------------------------------
+// Main connector structs
+// ---------------------------------------------------------------------------
+
+#[derive(Debug)]
+pub struct InfluxDbSource {
+    pub id: u32, // identifier supplied by the caller of `new()`
+    config: InfluxDbSourceConfig, // configuration supplied to `new()`
+    /// `None` until `open()` is called. Wraps `reqwest::Client` with
+    /// [`HttpRetryMiddleware`] so retry/back-off/jitter is handled
+    /// transparently by the middleware stack instead of a hand-rolled loop.
+    client: Option<ClientWithMiddleware>,
+    state: Mutex<State>, // `tokio::sync::Mutex`, per this file's imports
+    verbose: bool, // see `new()`: `config.verbose_logging`, defaulting to `false`
+    retry_delay: Duration, // see `new()`: `config.retry_delay` parsed with `DEFAULT_RETRY_DELAY` as fallback
+    poll_interval: Duration, // see `new()`: `config.poll_interval` parsed with `DEFAULT_POLL_INTERVAL` as fallback
+    /// Resolved once in `new()` — avoids a `to_ascii_lowercase()` allocation
+    /// on every message in the hot path.
+    payload_format: PayloadFormat,
+    circuit_breaker: Arc<CircuitBreaker>, // NOTE(review): presumably built from the threshold/cool-down resolved in `new()` — confirm, constructor body not fully visible
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct InfluxDbSourceConfig { // settings consumed by `InfluxDbSource::new()`
+    pub url: String,
+    pub org: String,
+    #[serde(serialize_with = "serialize_secret")] // custom serializer from `iggy_common::serde_secret`
+    pub token: SecretString,
+    pub query: String,
+    pub poll_interval: Option<String>, // duration string; `new()` falls back to DEFAULT_POLL_INTERVAL ("5s") when unset
+    pub batch_size: Option<u32>,
+    pub cursor_field: Option<String>,
+    pub initial_offset: Option<String>,
+    pub payload_column: Option<String>,
+    pub payload_format: Option<String>, // "json", "text"/"utf8" or "raw"/"base64", matched case-insensitively by `PayloadFormat::from_config`
+    pub include_metadata: Option<bool>,
+    pub verbose_logging: Option<bool>, // treated as `false` when unset
+    pub max_retries: Option<u32>,
+    pub retry_delay: Option<String>, // duration string; `new()` falls back to DEFAULT_RETRY_DELAY ("1s") when unset
+    pub timeout: Option<String>,
+    // How many times open() will retry before giving up
+    pub max_open_retries: Option<u32>,
+    // Upper cap on open() backoff delay — can be set high (e.g. "60s") for
+    // patient startup without affecting per-query retry behaviour
+    pub open_retry_max_delay: Option<String>,
+    // Upper cap on per-query retry backoff — kept short so a transient blip
+    // does not stall polling; independent of open_retry_max_delay
+    pub retry_max_delay: Option<String>,
+    // Circuit breaker configuration
+    pub circuit_breaker_threshold: Option<u32>, // falls back to DEFAULT_CIRCUIT_BREAKER_THRESHOLD (5) when unset
+    pub circuit_breaker_cool_down: Option<String>, // duration string; falls back to DEFAULT_CIRCUIT_COOL_DOWN ("30s") when unset
+}
+
+/// Encoding applied to message payloads produced by this source.
+///
+/// `Json` is the default variant and is also what an absent or unrecognised
+/// `payload_format` setting resolves to.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+enum PayloadFormat {
+    #[default]
+    Json,
+    Text,
+    Raw,
+}
+
+impl PayloadFormat {
+    /// Resolve the optional `payload_format` config string.
+    ///
+    /// Matching is ASCII case-insensitive:
+    /// * `"json"` or no value at all → [`PayloadFormat::Json`]
+    /// * `"text"` / `"utf8"`         → [`PayloadFormat::Text`]
+    /// * `"raw"` / `"base64"`        → [`PayloadFormat::Raw`]
+    ///
+    /// Any other value logs a warning and falls back to JSON. An omitted
+    /// setting is *not* a misconfiguration, so it selects the default
+    /// silently instead of emitting an "Unrecognized … None" warning.
+    fn from_config(value: Option<&str>) -> Self {
+        match value.map(|v| v.to_ascii_lowercase()).as_deref() {
+            // `None` means the option was left out: use the default quietly.
+            None | Some("json") => PayloadFormat::Json,
+            Some("text") | Some("utf8") => PayloadFormat::Text,
+            Some("raw") | Some("base64") => PayloadFormat::Raw,
+            unrecognized => {
+                warn!(
+                    "Unrecognized payload_format value {:?}, falling back to JSON. \
+                     Valid values are: \"json\", \"text\", \"utf8\", \"base64\", \"raw\".",
+                    unrecognized
+                );
+                PayloadFormat::Json
+            }
+        }
+    }
+
+    /// Map the payload format onto the SDK [`Schema`] variant of the same name.
+    fn schema(self) -> Schema {
+        match self {
+            PayloadFormat::Json => Schema::Json,
+            PayloadFormat::Text => Schema::Text,
+            PayloadFormat::Raw => Schema::Raw,
+        }
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+struct State {
+    last_poll_time: DateTime<Utc>, // NOTE(review): presumably when the most recent poll ran — confirm in poll_messages
+    last_timestamp: Option<String>, // cursor timestamp; `cursor_row_count` below counts rows already delivered at this value
+    processed_rows: u64, // NOTE(review): presumably a running total of rows emitted — confirm
+    /// How many rows at `last_timestamp` have already been delivered 
downstream.
+    ///
+    /// When the user's Flux query uses `>= $cursor`, consecutive polls may
+    /// return the same rows for the current cursor timestamp.  This counter
+    /// lets `poll_messages` skip those already-delivered rows and inflate
+    /// `$limit` accordingly, preventing both duplicates and data loss at
+    /// batch boundaries where multiple rows share the same timestamp.
+    ///
+    /// `#[serde(default)]` keeps existing persisted state files 
forward-compatible:
+    /// the field defaults to 0 when the state was saved by an older version.
+    #[serde(default)]
+    cursor_row_count: u64,
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/// Convert one textual cell into the most specific JSON value it can carry.
+///
+/// Candidates are tried in this order and the first that fits wins:
+/// 1. empty string → `null`
+/// 2. parses as `bool` → JSON boolean
+/// 3. parses as `i64` → JSON integer
+/// 4. parses as `f64` *and* is accepted by `serde_json::Number::from_f64`
+///    → JSON number
+/// 5. anything else → JSON string holding the input unchanged
+fn parse_scalar(value: &str) -> serde_json::Value {
+    use serde_json::{Number, Value};
+
+    if value.is_empty() {
+        Value::Null
+    } else if let Ok(flag) = value.parse::<bool>() {
+        Value::Bool(flag)
+    } else if let Ok(integer) = value.parse::<i64>() {
+        Value::Number(integer.into())
+    } else if let Some(float) = value.parse::<f64>().ok().and_then(Number::from_f64) {
+        Value::Number(float)
+    } else {
+        Value::String(value.to_string())
+    }
+}
+
+/// Decide whether a CSV record is an InfluxDB header row.
+///
+/// The only requirement is that some field equals `_time` exactly. `_value`
+/// is deliberately **not** required: Flux aggregations such as `count()`,
+/// `mean()` or `group()` emit columns like `_count` / `_mean` in its place,
+/// and insisting on `_value` would make those headers go unrecognised, so
+/// every data row up to the next recognised header would be silently skipped.
+///
+/// Annotation rows (`#group`, `#datatype`, `#default`) never reach this
+/// function; `parse_csv_rows` drops them beforehand via its leading-`#` check.
+fn is_header_record(record: &StringRecord) -> bool {
+    const TIME_COLUMN: &str = "_time";
+    record.iter().any(|column| column == TIME_COLUMN)
+}
+
+/// Return `true` when timestamp `a` is chronologically later than `b`.
+///
+/// Both arguments are parsed as `DateTime<Utc>` and compared as instants.
+/// Plain string ordering is not reliable here: InfluxDB drops trailing zeros
+/// from fractional seconds, so `"2026-03-18T12:00:00.60952Z"` (609520µs)
+/// sorts *after* `"2026-03-18T12:00:00.609521Z"` as text — `'Z'` (ASCII 90)
+/// is greater than `'1'` (ASCII 49) — although it is the earlier instant.
+///
+/// If either argument fails to parse, the function falls back to comparing
+/// the raw strings lexicographically.
+fn is_timestamp_after(a: &str, b: &str) -> bool {
+    let parsed_a = a.parse::<DateTime<Utc>>();
+    let parsed_b = b.parse::<DateTime<Utc>>();
+
+    if let (Ok(instant_a), Ok(instant_b)) = (parsed_a, parsed_b) {
+        return instant_a > instant_b;
+    }
+
+    // At least one side is not a parseable timestamp: compare as text.
+    a > b
+}
+
+// ---------------------------------------------------------------------------
+// InfluxDbSource implementation
+// ---------------------------------------------------------------------------
+
+impl InfluxDbSource {
+    pub fn new(id: u32, config: InfluxDbSourceConfig, state: 
Option<ConnectorState>) -> Self {
+        let verbose = config.verbose_logging.unwrap_or(false);
+        let retry_delay = parse_duration(config.retry_delay.as_deref(), 
DEFAULT_RETRY_DELAY);
+        let poll_interval = parse_duration(config.poll_interval.as_deref(), 
DEFAULT_POLL_INTERVAL);
+        let payload_format = 
PayloadFormat::from_config(config.payload_format.as_deref());
+
+        // Build circuit breaker from config
+        let cb_threshold = config
+            .circuit_breaker_threshold
+            .unwrap_or(DEFAULT_CIRCUIT_BREAKER_THRESHOLD);
+        let cb_cool_down = parse_duration(
+            config.circuit_breaker_cool_down.as_deref(),
+            DEFAULT_CIRCUIT_COOL_DOWN,
+        );
+
+        let restored_state = state

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to