hubcio commented on code in PR #2933: URL: https://github.com/apache/iggy/pull/2933#discussion_r3008198385
########## core/connectors/sources/influxdb_source/src/lib.rs: ########## @@ -0,0 +1,1241 @@ +/* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use async_trait::async_trait; +use base64::{Engine as _, engine::general_purpose}; +use csv::StringRecord; +use iggy_common::serde_secret::serialize_secret; +use iggy_common::{DateTime, Utc}; +use iggy_connector_sdk::retry::{ + CircuitBreaker, ConnectivityConfig, build_retry_client, check_connectivity_with_retry, + parse_duration, +}; +use iggy_connector_sdk::{ + ConnectorState, Error, ProducedMessage, ProducedMessages, Schema, Source, source_connector, +}; +use regex::Regex; +use reqwest::Url; +use reqwest_middleware::ClientWithMiddleware; +use secrecy::{ExposeSecret, SecretString}; +use serde::{Deserialize, Serialize}; +use serde_json::json; +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::OnceLock; +use std::time::Duration; +use tokio::sync::Mutex; +use tracing::{debug, error, info, warn}; +use uuid::Uuid; + +source_connector!(InfluxDbSource); + +const CONNECTOR_NAME: &str = "InfluxDB source"; +const DEFAULT_MAX_RETRIES: u32 = 3; +const DEFAULT_RETRY_DELAY: &str = "1s"; +const DEFAULT_POLL_INTERVAL: &str = "5s"; +const DEFAULT_TIMEOUT: &str = "10s"; +const DEFAULT_CURSOR: &str = "1970-01-01T00:00:00Z"; +// Maximum attempts for open() connectivity retries +const DEFAULT_MAX_OPEN_RETRIES: u32 = 10; +// Cap for exponential backoff in open() — never wait longer than this +const DEFAULT_OPEN_RETRY_MAX_DELAY: &str = "60s"; +// Cap for exponential backoff on per-query retries — kept short so a +// transient InfluxDB blip does not stall polling for too long +const DEFAULT_RETRY_MAX_DELAY: &str = "5s"; +// How many consecutive poll failures open the circuit breaker +const DEFAULT_CIRCUIT_BREAKER_THRESHOLD: u32 = 5; +// How long the circuit stays open before allowing a probe attempt +const DEFAULT_CIRCUIT_COOL_DOWN: &str = "30s"; + +/// RFC 3339 / ISO 8601 datetime pattern. +/// Matches the forms InfluxDB stores in `_time`: +/// "2024-01-15T10:30:00Z" +/// "2024-01-15T10:30:00.123456789Z" +/// "2024-01-15T10:30:00+05:30" +/// Intentionally strict: only digits, T, Z, colon, dot, plus, hyphen. +/// Any Flux syntax character (pipe, quote, paren, space, slash) is rejected. +static CURSOR_RE: OnceLock<Regex> = OnceLock::new(); + +// --------------------------------------------------------------------------- +// Main connector structs +// --------------------------------------------------------------------------- + +#[derive(Debug)] +pub struct InfluxDbSource { + pub id: u32, + config: InfluxDbSourceConfig, + /// `None` until `open()` is called. 
Wraps `reqwest::Client` with + /// [`HttpRetryMiddleware`] so retry/back-off/jitter is handled + /// transparently by the middleware stack instead of a hand-rolled loop. + client: Option<ClientWithMiddleware>, + state: Mutex<State>, + verbose: bool, + retry_delay: Duration, + poll_interval: Duration, + /// Resolved once in `new()` — avoids a `to_ascii_lowercase()` allocation + /// on every message in the hot path. + payload_format: PayloadFormat, + circuit_breaker: Arc<CircuitBreaker>, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InfluxDbSourceConfig { + pub url: String, + pub org: String, + #[serde(serialize_with = "serialize_secret")] + pub token: SecretString, + pub query: String, + pub poll_interval: Option<String>, + pub batch_size: Option<u32>, + pub cursor_field: Option<String>, + pub initial_offset: Option<String>, + pub payload_column: Option<String>, + pub payload_format: Option<String>, + pub include_metadata: Option<bool>, + pub verbose_logging: Option<bool>, + pub max_retries: Option<u32>, + pub retry_delay: Option<String>, + pub timeout: Option<String>, + // How many times open() will retry before giving up + pub max_open_retries: Option<u32>, + // Upper cap on open() backoff delay — can be set high (e.g. "60s") for + // patient startup without affecting per-query retry behaviour + pub open_retry_max_delay: Option<String>, + // Upper cap on per-query retry backoff — kept short so a transient blip + // does not stall polling; independent of open_retry_max_delay + pub retry_max_delay: Option<String>, + // Circuit breaker configuration + pub circuit_breaker_threshold: Option<u32>, + pub circuit_breaker_cool_down: Option<String>, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +enum PayloadFormat { + #[default] + Json, + Text, + Raw, +} + +impl PayloadFormat { + fn from_config(value: Option<&str>) -> Self { + match value.map(|v| v.to_ascii_lowercase()).as_deref() { + Some("text") | Some("utf8") => PayloadFormat::Text, + Some("raw") | Some("base64") => PayloadFormat::Raw, + Some("json") => PayloadFormat::Json, + other => { + warn!( + "Unrecognized payload_format value {:?}, falling back to JSON. \ + Valid values are: \"json\", \"text\", \"utf8\", \"base64\", \"raw\".", + other + ); + PayloadFormat::Json + } + } + } + + fn schema(self) -> Schema { + match self { + PayloadFormat::Json => Schema::Json, + PayloadFormat::Text => Schema::Text, + PayloadFormat::Raw => Schema::Raw, + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +struct State { + last_poll_time: DateTime<Utc>, + last_timestamp: Option<String>, + processed_rows: u64, + /// How many rows at `last_timestamp` have already been delivered downstream. + /// + /// When the user's Flux query uses `>= $cursor`, consecutive polls may + /// return the same rows for the current cursor timestamp. This counter + /// lets `poll_messages` skip those already-delivered rows and inflate + /// `$limit` accordingly, preventing both duplicates and data loss at + /// batch boundaries where multiple rows share the same timestamp. + /// + /// `#[serde(default)]` keeps existing persisted state files forward-compatible: + /// the field defaults to 0 when the state was saved by an older version. 
+ #[serde(default)] + cursor_row_count: u64, +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn parse_scalar(value: &str) -> serde_json::Value { + if value.is_empty() { + return serde_json::Value::Null; + } + if let Ok(v) = value.parse::<bool>() { + return serde_json::Value::Bool(v); + } + if let Ok(v) = value.parse::<i64>() { + return serde_json::Value::Number(v.into()); + } + if let Ok(v) = value.parse::<f64>() + && let Some(number) = serde_json::Number::from_f64(v) + { + return serde_json::Value::Number(number); + } + serde_json::Value::String(value.to_string()) +} + +/// Recognise an InfluxDB CSV header row. +/// +/// A header row must contain a `_time` column. The `_value` column is +/// intentionally **not** required: Flux aggregation queries (`count()`, +/// `mean()`, `group()`) produce result tables with columns like `_count` or +/// `_mean` instead of `_value`. Requiring `_value` would cause those header +/// rows to be missed, silently skipping all subsequent data rows until the +/// next recognised header. +/// +/// InfluxDB annotation rows (`#group`, `#datatype`, `#default`) are already +/// filtered out earlier in `parse_csv_rows` by the leading-`#` check, so +/// they will never reach this function. +fn is_header_record(record: &StringRecord) -> bool { + record.iter().any(|v| v == "_time") +} + +/// Compare two RFC 3339 timestamp strings chronologically. +/// +/// InfluxDB strips trailing fractional-second zeros, producing timestamps like +/// `"2026-03-18T12:00:00.60952Z"` (= 609520µs). A naïve `>` string comparison +/// treats this as *greater* than `"2026-03-18T12:00:00.609521Z"` because `'Z'` +/// (ASCII 90) > `'1'` (ASCII 49), even though the former is chronologically +/// *earlier*. Always parse to `DateTime<Utc>` so the comparison is correct. +fn is_timestamp_after(a: &str, b: &str) -> bool { + match (a.parse::<DateTime<Utc>>(), b.parse::<DateTime<Utc>>()) { + (Ok(dt_a), Ok(dt_b)) => dt_a > dt_b, + _ => a > b, + } +} + +// --------------------------------------------------------------------------- +// InfluxDbSource implementation +// --------------------------------------------------------------------------- + +impl InfluxDbSource { + pub fn new(id: u32, config: InfluxDbSourceConfig, state: Option<ConnectorState>) -> Self { + let verbose = config.verbose_logging.unwrap_or(false); + let retry_delay = parse_duration(config.retry_delay.as_deref(), DEFAULT_RETRY_DELAY); + let poll_interval = parse_duration(config.poll_interval.as_deref(), DEFAULT_POLL_INTERVAL); + let payload_format = PayloadFormat::from_config(config.payload_format.as_deref()); + + // Build circuit breaker from config + let cb_threshold = config + .circuit_breaker_threshold + .unwrap_or(DEFAULT_CIRCUIT_BREAKER_THRESHOLD); + let cb_cool_down = parse_duration( + config.circuit_breaker_cool_down.as_deref(), + DEFAULT_CIRCUIT_COOL_DOWN, + ); + + let restored_state = state + .and_then(|s| s.deserialize::<State>(CONNECTOR_NAME, id)) + .inspect(|s| { + info!( + "Restored state for {CONNECTOR_NAME} connector with ID: {id}. 
\ + Last timestamp: {:?}, processed rows: {}", + s.last_timestamp, s.processed_rows + ); + }); + + InfluxDbSource { + id, + config, + client: None, + state: Mutex::new(restored_state.unwrap_or(State { + last_poll_time: Utc::now(), + last_timestamp: None, + processed_rows: 0, + cursor_row_count: 0, + })), + verbose, + retry_delay, + poll_interval, + payload_format, + circuit_breaker: Arc::new(CircuitBreaker::new(cb_threshold, cb_cool_down)), + } + } + + fn serialize_state(&self, state: &State) -> Option<ConnectorState> { + ConnectorState::serialize(state, CONNECTOR_NAME, self.id) + } + + fn payload_format(&self) -> PayloadFormat { + self.payload_format + } + + fn cursor_field(&self) -> &str { + self.config.cursor_field.as_deref().unwrap_or("_time") + } + + fn get_max_retries(&self) -> u32 { + self.config + .max_retries + .unwrap_or(DEFAULT_MAX_RETRIES) + .max(1) + } + + fn build_raw_client(&self) -> Result<reqwest::Client, Error> { + let timeout = parse_duration(self.config.timeout.as_deref(), DEFAULT_TIMEOUT); + reqwest::Client::builder() + .timeout(timeout) + .build() + .map_err(|e| Error::InitError(format!("Failed to create HTTP client: {e}"))) + } + + fn get_client(&self) -> Result<&ClientWithMiddleware, Error> { + self.client + .as_ref() + .ok_or_else(|| Error::Connection("InfluxDB client is not initialized".to_string())) + } + + fn build_health_url(&self) -> Result<Url, Error> { + let base = self.config.url.trim_end_matches('/'); + Url::parse(&format!("{base}/health")) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}"))) + } + + fn build_query_url(&self) -> Result<Url, Error> { + let base = self.config.url.trim_end_matches('/'); + let mut url = Url::parse(&format!("{base}/api/v2/query")) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}")))?; + url.query_pairs_mut().append_pair("org", &self.config.org); + Ok(url) + } + + fn cursor_re() -> &'static Regex { + CURSOR_RE.get_or_init(|| { + Regex::new(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$") + .expect("hardcoded regex is valid") + }) + } + + fn validate_cursor(cursor: &str) -> Result<(), Error> { + if Self::cursor_re().is_match(cursor) { + Ok(()) + } else { + Err(Error::InvalidConfigValue(format!( + "cursor value {:?} is not a valid RFC 3339 timestamp; \ + refusing substitution to prevent Flux query injection", + cursor + ))) + } + } + + /// Reject cursor fields that would produce incorrect results. + /// + /// Cursor advancement compares values as `String`s (lexicographic order). + /// This is correct for ISO 8601 / RFC 3339 timestamps — the default + /// `cursor_field` of `"_time"` — because their fixed-width format makes + /// lexicographic and chronological order identical. + fn validate_cursor_field(field: &str) -> Result<(), Error> { + match field { + "_time" | "time" => Ok(()), + other => Err(Error::InvalidConfigValue(format!( + "cursor_field {:?} is not supported — cursor values are compared as strings \ + (lexicographic order), which is only correct for ISO 8601 timestamp columns. \ + Use the default \"_time\" column, or omit cursor_field entirely.", + other + ))), + } + } + + fn query_with_params(&self, cursor: &str, already_seen: u64) -> Result<String, Error> { + // Reject anything that is not a well-formed RFC 3339 timestamp. + // This prevents a crafted or corrupted _time value (e.g. containing + // Flux syntax like `") |> drop() //`) from being injected into the + // query string before it is sent to /api/v2/query. 
+ // Note: InfluxDB OSS v2 does not support the `params` JSON field for + // parameterized queries (Cloud-only feature), so substitution is + // unavoidable for OSS — validation is the correct mitigation here. + Self::validate_cursor(cursor)?; + // Inflate the limit so that after skipping `already_seen` rows at the + // cursor timestamp we still return a full batch of new rows. This is + // a no-op when `already_seen == 0` (first poll or `>` queries). + let batch_size = self.config.batch_size.unwrap_or(500) as u64; + let limit = batch_size.saturating_add(already_seen).to_string(); + let mut query = self.config.query.clone(); + if query.contains("$cursor") { + query = query.replace("$cursor", cursor); + } + if query.contains("$limit") { + query = query.replace("$limit", &limit); + } + Ok(query) + } + + /// Execute a Flux query against `/api/v2/query` and return the raw CSV + /// response body. Retry/back-off is handled transparently by the + /// `ClientWithMiddleware` stack (see `build_retry_client`). + async fn run_query(&self, query: &str) -> Result<String, Error> { + let client = self.get_client()?; + let url = self.build_query_url()?; + let token = self.config.token.expose_secret().to_owned(); + + let body = json!({ + "query": query, + "dialect": { + "annotations": [], + "delimiter": ",", + "header": true, + "commentPrefix": "#" + } + }); + + let response = client + .post(url) + .header("Authorization", format!("Token {token}")) + .header("Content-Type", "application/json") + .header("Accept", "text/csv") + .json(&body) + .send() + .await + .map_err(|e| Error::Storage(format!("InfluxDB query failed: {e}")))?; + + let status = response.status(); + if status.is_success() { + return response + .text() + .await + .map_err(|e| Error::Storage(format!("Failed to read query response: {e}"))); + } + + let body_text = response + .text() + .await + .unwrap_or_else(|_| "failed to read response body".to_string()); + + // Use PermanentHttpError for non-transient 4xx (400 Bad Request, 401 + // Unauthorized, etc.) so poll() can skip the circuit breaker for these + // — they indicate a config/data issue, not an infrastructure failure. 
+ if iggy_connector_sdk::retry::is_transient_status(status) { + Err(Error::Storage(format!( + "InfluxDB query failed with status {status}: {body_text}" + ))) + } else { + Err(Error::PermanentHttpError(format!( + "InfluxDB query failed with status {status}: {body_text}" + ))) + } + } + + fn parse_csv_rows(&self, csv_text: &str) -> Result<Vec<HashMap<String, String>>, Error> { + let mut reader = csv::ReaderBuilder::new() + .has_headers(false) + .from_reader(csv_text.as_bytes()); + + let mut headers: Option<StringRecord> = None; + let mut rows = Vec::new(); + + for result in reader.records() { + let record = result + .map_err(|e| Error::InvalidRecordValue(format!("Invalid CSV record: {e}")))?; + + if record.is_empty() { + continue; + } + + if let Some(first) = record.get(0) + && first.starts_with('#') + { + continue; + } + + if is_header_record(&record) { + headers = Some(record.clone()); + continue; + } + + let Some(active_headers) = headers.as_ref() else { + continue; + }; + + if record == *active_headers { + continue; + } + + let mut mapped = HashMap::new(); + for (idx, key) in active_headers.iter().enumerate() { + if key.is_empty() { + continue; + } + let value = record.get(idx).unwrap_or("").to_string(); + mapped.insert(key.to_string(), value); + } + + if !mapped.is_empty() { + rows.push(mapped); + } + } + + Ok(rows) + } + + fn build_payload( + &self, + row: &HashMap<String, String>, + include_metadata: bool, + ) -> Result<Vec<u8>, Error> { + if let Some(payload_column) = self.config.payload_column.as_deref() { + let raw_value = row.get(payload_column).cloned().ok_or_else(|| { + Error::InvalidRecordValue(format!("Missing payload column '{payload_column}'")) + })?; + + return match self.payload_format() { + PayloadFormat::Json => { + let value: serde_json::Value = + serde_json::from_str(&raw_value).map_err(|e| { + Error::InvalidRecordValue(format!( + "Payload column '{payload_column}' is not valid JSON: {e}" + )) + })?; + serde_json::to_vec(&value).map_err(|e| { + Error::Serialization(format!("JSON serialization failed: {e}")) + }) + } + PayloadFormat::Text => Ok(raw_value.into_bytes()), + PayloadFormat::Raw => general_purpose::STANDARD + .decode(raw_value.as_bytes()) + .map_err(|e| { + Error::InvalidRecordValue(format!( + "Failed to decode payload as base64: {e}" + )) + }), + }; + } + + let mut json_row = serde_json::Map::new(); + for (key, value) in row { + if include_metadata || key == "_value" || key == "_time" || key == "_measurement" { + json_row.insert(key.clone(), parse_scalar(value)); + } + } + + let wrapped = json!({ + "measurement": row.get("_measurement").cloned().unwrap_or_default(), + "field": row.get("_field").cloned().unwrap_or_default(), + "timestamp": row.get("_time").cloned().unwrap_or_default(), + "value": row.get("_value").map(|v| parse_scalar(v)).unwrap_or(serde_json::Value::Null), + "row": json_row, + }); + + serde_json::to_vec(&wrapped) + .map_err(|e| Error::Serialization(format!("JSON serialization failed: {e}"))) + } + + /// Returns `(messages, max_cursor, rows_at_max_cursor)`. + /// + /// `rows_at_max_cursor` is the count of delivered messages whose cursor + /// field value equals `max_cursor`. The caller stores this in + /// [`State::cursor_row_count`] so the next poll can skip those rows when + /// the query uses `>= $cursor`. + async fn poll_messages(&self) -> Result<(Vec<ProducedMessage>, Option<String>, u64), Error> { + // Read cursor and already_seen atomically from the same lock acquisition + // so the two values are always consistent with each other. 
+ let (cursor, already_seen) = { + let state = self.state.lock().await; + let c = state + .last_timestamp + .clone() + .or_else(|| self.config.initial_offset.clone()) + .unwrap_or_else(|| DEFAULT_CURSOR.to_string()); + (c, state.cursor_row_count) + }; + + let query = self.query_with_params(&cursor, already_seen).map_err(|e| { + error!( + "InfluxDB source ID: {} — invalid cursor, skipping poll: {e}", + self.id + ); + e + })?; + let csv_data = self.run_query(&query).await?; + + let rows = self.parse_csv_rows(&csv_data)?; + let include_metadata = self.config.include_metadata.unwrap_or(true); + let cursor_field = self.cursor_field().to_string(); + + let mut messages = Vec::with_capacity(rows.len()); + let mut max_cursor: Option<String> = None; + let mut rows_at_max_cursor = 0u64; + let mut skipped = 0u64; + + for row in rows { + // Skip rows at the current cursor that were already delivered in a + // previous batch. This deduplicates rows when the query uses + // `>= $cursor` and a batch boundary landed inside a group of rows + // sharing the same timestamp. + if let Some(cv) = row.get(&cursor_field) + && cv == &cursor + && skipped < already_seen + { + skipped += 1; + continue; + } + + // Track the new max cursor and how many delivered rows share it. + if let Some(cv) = row.get(&cursor_field) { + match &max_cursor { + None => { + max_cursor = Some(cv.clone()); + rows_at_max_cursor = 1; + } + Some(current) => { + if is_timestamp_after(cv, current) { + max_cursor = Some(cv.clone()); + rows_at_max_cursor = 1; + } else if cv == current { + rows_at_max_cursor += 1; + } + } + } + } + + let payload = self.build_payload(&row, include_metadata)?; + // Capture once so timestamp and origin_timestamp are guaranteed identical + // and we make exactly one syscall regardless of how many fields use it. + let now_micros = Utc::now().timestamp_micros() as u64; + + messages.push(ProducedMessage { + id: Some(Uuid::new_v4().as_u128()), + checksum: None, + timestamp: Some(now_micros), + origin_timestamp: Some(now_micros), + headers: None, + payload, + }); + } + + Ok((messages, max_cursor, rows_at_max_cursor)) + } +} + +// --------------------------------------------------------------------------- +// Source trait implementation +// --------------------------------------------------------------------------- + +#[async_trait] +impl Source for InfluxDbSource { + async fn open(&mut self) -> Result<(), Error> { + info!( + "Opening InfluxDB source connector with ID: {}. Org: {}", + self.id, self.config.org + ); + + // Build the raw client first and use it for the startup connectivity + // check. The connectivity retry loop uses separate delay bounds + // (open_retry_max_delay) from the per-query middleware retries, so + // we keep them independent. + let raw_client = self.build_raw_client()?; + + // Validate cursor_field before touching the network: string comparison + // is only safe for timestamp columns. See validate_cursor_field for details. + Self::validate_cursor_field(self.cursor_field())?; + + let health_url = self.build_health_url()?; + check_connectivity_with_retry( + &raw_client, + health_url, + "InfluxDB source", + self.id, + &ConnectivityConfig { + max_open_retries: self + .config + .max_open_retries + .unwrap_or(DEFAULT_MAX_OPEN_RETRIES), + open_retry_max_delay: parse_duration( + self.config.open_retry_max_delay.as_deref(), + DEFAULT_OPEN_RETRY_MAX_DELAY, + ), + retry_delay: self.retry_delay, + }, + ) + .await?; + + // Wrap in the retry middleware for all subsequent query operations.
+ // The middleware handles transient 429 / 5xx retries with + // exponential back-off, jitter, and Retry-After header support. + let max_retries = self.get_max_retries(); + let query_retry_max_delay = parse_duration( + self.config.retry_max_delay.as_deref(), + DEFAULT_RETRY_MAX_DELAY, + ); + self.client = Some(build_retry_client( + raw_client, + max_retries, + self.retry_delay, + query_retry_max_delay, + "InfluxDB", + )); + + info!( + "InfluxDB source connector with ID: {} opened successfully", + self.id + ); + Ok(()) + } + + async fn poll(&self) -> Result<ProducedMessages, Error> { + // Skip query if circuit breaker is open; sleep so the runtime does not + // spin-call poll() in a hot loop while the circuit is held open. + if self.circuit_breaker.is_open().await { + warn!( + "InfluxDB source ID: {} — circuit breaker is OPEN. Skipping poll.", + self.id + ); + tokio::time::sleep(self.poll_interval).await; + return Ok(ProducedMessages { + schema: Schema::Json, + messages: vec![], + state: None, + }); + } + + match self.poll_messages().await { + Ok((messages, max_cursor, rows_at_max_cursor)) => { + // Successful poll — reset circuit breaker + self.circuit_breaker.record_success(); + + let mut state = self.state.lock().await; + state.last_poll_time = Utc::now(); + state.processed_rows += messages.len() as u64; + match max_cursor { + Some(ref new_cursor) + if state.last_timestamp.as_deref() != Some(new_cursor.as_str()) => + { + // Cursor advanced to a new timestamp — reset the row counter. + state.last_timestamp = max_cursor.clone(); + state.cursor_row_count = rows_at_max_cursor; + } + Some(_) => { + // Cursor stayed at the same timestamp — accumulate so the + // next poll skips all already-delivered rows at this timestamp. + state.cursor_row_count =
Review Comment: `cursor_row_count` accumulates via `saturating_add` across polls but never resets when the cursor doesn't advance. If many rows share a timestamp across repeated polls, the counter grows without bound; once it exceeds the actual row count at that timestamp, the skip logic at line 589 skips everything and every poll produces 0 messages, so the connector gets permanently stuck. It needs a cap (e.g. reset when a poll produces 0 messages after skipping) or a bound tied to batch_size.
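A minimal sketch of the reset option, assuming `poll_messages()` is extended to also return `skipped` (the number of rows dropped by the `>= $cursor` dedup pass); only the `None if skipped > 0` arm is new, everything else is the PR's existing match:

```rust
match max_cursor {
    Some(ref new_cursor)
        if state.last_timestamp.as_deref() != Some(new_cursor.as_str()) =>
    {
        // Cursor advanced to a new timestamp: reset the row counter.
        state.last_timestamp = max_cursor.clone();
        state.cursor_row_count = rows_at_max_cursor;
    }
    Some(_) => {
        // Same timestamp: accumulate so the next poll skips these rows.
        state.cursor_row_count =
            state.cursor_row_count.saturating_add(rows_at_max_cursor);
    }
    // Every returned row was skipped and nothing was delivered: the counter
    // has overshot the real row count at this timestamp. Clamp it to what
    // was actually observed so the next poll can make progress again.
    None if skipped > 0 => {
        state.cursor_row_count = skipped;
    }
    None => {}
}
```

########## core/connectors/sinks/influxdb_sink/src/lib.rs: ########## @@ -0,0 +1,1555 @@ +/* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.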
+ */ + +use async_trait::async_trait; +use base64::{Engine as _, engine::general_purpose}; +use bytes::Bytes; +use iggy_common::serde_secret::serialize_secret; +use iggy_connector_sdk::retry::{ + CircuitBreaker, ConnectivityConfig, build_retry_client, check_connectivity_with_retry, + parse_duration, +}; +use iggy_connector_sdk::{ + ConsumedMessage, Error, MessagesMetadata, Sink, TopicMetadata, sink_connector, +}; +use reqwest::Url; +use reqwest_middleware::ClientWithMiddleware; +use secrecy::{ExposeSecret, SecretString}; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Duration; +use std::time::SystemTime; +use std::time::UNIX_EPOCH; +use tracing::{debug, error, info, warn}; +sink_connector!(InfluxDbSink); + +const DEFAULT_MAX_RETRIES: u32 = 3; +const DEFAULT_RETRY_DELAY: &str = "1s"; +const DEFAULT_TIMEOUT: &str = "30s"; +const DEFAULT_PRECISION: &str = "us"; +// Maximum attempts for open() connectivity retries +const DEFAULT_MAX_OPEN_RETRIES: u32 = 10; +// Cap for exponential backoff in open() — never wait longer than this +const DEFAULT_OPEN_RETRY_MAX_DELAY: &str = "60s"; +// Cap for exponential backoff on per-write retries — kept short so a +// transient InfluxDB blip does not stall message delivery for too long +const DEFAULT_RETRY_MAX_DELAY: &str = "5s"; +// How many consecutive batch failures open the circuit breaker +const DEFAULT_CIRCUIT_BREAKER_THRESHOLD: u32 = 5; +// How long the circuit stays open before allowing a probe attempt +const DEFAULT_CIRCUIT_COOL_DOWN: &str = "30s"; + +// --------------------------------------------------------------------------- +// Main connector structs +// --------------------------------------------------------------------------- + +#[derive(Debug)] +pub struct InfluxDbSink { + pub id: u32, + config: InfluxDbSinkConfig, + /// `None` until `open()` is called. Wraps `reqwest::Client` with + /// [`HttpRetryMiddleware`] so retry/back-off/jitter is handled + /// transparently by the middleware stack instead of a hand-rolled loop. + client: Option<ClientWithMiddleware>, + /// Cached once in `open()` — config fields never change at runtime. + write_url: Option<Url>, + messages_attempted: AtomicU64, + write_success: AtomicU64, + write_errors: AtomicU64, + verbose: bool, + retry_delay: Duration, + /// Resolved once in `new()` — avoids a `to_ascii_lowercase()` allocation + /// on every message in the hot path. + payload_format: PayloadFormat, + circuit_breaker: Arc<CircuitBreaker>, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InfluxDbSinkConfig { + pub url: String, + pub org: String, + pub bucket: String, + #[serde(serialize_with = "serialize_secret")] + pub token: SecretString, + pub measurement: Option<String>, + pub precision: Option<String>, + pub batch_size: Option<u32>, + pub include_metadata: Option<bool>, + pub include_checksum: Option<bool>, + pub include_origin_timestamp: Option<bool>, + pub include_stream_tag: Option<bool>, + pub include_topic_tag: Option<bool>, + pub include_partition_tag: Option<bool>, + pub payload_format: Option<String>, + pub verbose_logging: Option<bool>, + pub max_retries: Option<u32>, + pub retry_delay: Option<String>, + pub timeout: Option<String>, + // How many times open() will retry before giving up + pub max_open_retries: Option<u32>, + // Upper cap on open() backoff delay — can be set high (e.g. 
"60s") for + // patient startup without affecting per-write retry behaviour + pub open_retry_max_delay: Option<String>, + // Upper cap on per-write retry backoff — kept short so a transient blip + // does not stall message delivery; independent of open_retry_max_delay + pub retry_max_delay: Option<String>, + // Circuit breaker configuration + pub circuit_breaker_threshold: Option<u32>, + pub circuit_breaker_cool_down: Option<String>, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +enum PayloadFormat { + #[default] + Json, + Text, + Base64, +} + +impl PayloadFormat { + fn from_config(value: Option<&str>) -> Self { + match value.map(|v| v.to_ascii_lowercase()).as_deref() { + Some("text") | Some("utf8") => PayloadFormat::Text, + Some("base64") | Some("raw") => PayloadFormat::Base64, + Some("json") => PayloadFormat::Json, + other => { + warn!( + "Unrecognized payload_format value {:?}, falling back to JSON. \ + Valid values are: \"json\", \"text\", \"utf8\", \"base64\", \"raw\".", + other + ); + PayloadFormat::Json + } + } + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Write an escaped measurement name into `buf`. +/// Escapes: `\` → `\\`, `,` → `\,`, ` ` → `\ `, `\n` → `\\n`, `\r` → `\\r` +/// +/// Newline (`\n`) and carriage-return (`\r`) are the InfluxDB line-protocol +/// record delimiters; a literal newline inside a measurement name would split +/// the line and corrupt the batch. +fn write_measurement(buf: &mut String, value: &str) { + for ch in value.chars() { + match ch { + '\\' => buf.push_str("\\\\"), + ',' => buf.push_str("\\,"), + ' ' => buf.push_str("\\ "), + '\n' => buf.push_str("\\n"), + '\r' => buf.push_str("\\r"), + _ => buf.push(ch), + } + } +} + +/// Write an escaped tag key/value into `buf`. +/// Escapes: `\` → `\\`, `,` → `\,`, `=` → `\=`, ` ` → `\ `, `\n` → `\\n`, `\r` → `\\r` +/// +/// Newline and carriage-return are escaped for the same reason as in +/// [`write_measurement`]: they are InfluxDB line-protocol record delimiters. +fn write_tag_value(buf: &mut String, value: &str) { + for ch in value.chars() { + match ch { + '\\' => buf.push_str("\\\\"), + ',' => buf.push_str("\\,"), + '=' => buf.push_str("\\="), + ' ' => buf.push_str("\\ "), + '\n' => buf.push_str("\\n"), + '\r' => buf.push_str("\\r"), + _ => buf.push(ch), + } + } +} + +/// Write an escaped string field value (without surrounding quotes) into `buf`. +/// Escapes: `\` → `\\`, `"` → `\"`, `\n` → `\\n`, `\r` → `\\r` +/// +/// Newline and carriage-return are the InfluxDB line-protocol record +/// delimiters; a literal newline inside a string field value (e.g. from a +/// multi-line text payload) would split the line and corrupt the batch. 
+fn write_field_string(buf: &mut String, value: &str) { + for ch in value.chars() { + match ch { + '\\' => buf.push_str("\\\\"), + '"' => buf.push_str("\\\""), + '\n' => buf.push_str("\\n"), + '\r' => buf.push_str("\\r"), + _ => buf.push(ch), + } + } +} + +// --------------------------------------------------------------------------- +// InfluxDbSink implementation +// --------------------------------------------------------------------------- + +impl InfluxDbSink { + pub fn new(id: u32, config: InfluxDbSinkConfig) -> Self { + let verbose = config.verbose_logging.unwrap_or(false); + let retry_delay = parse_duration(config.retry_delay.as_deref(), DEFAULT_RETRY_DELAY); + let payload_format = PayloadFormat::from_config(config.payload_format.as_deref()); + + // Build circuit breaker from config + let cb_threshold = config + .circuit_breaker_threshold + .unwrap_or(DEFAULT_CIRCUIT_BREAKER_THRESHOLD); + let cb_cool_down = parse_duration( + config.circuit_breaker_cool_down.as_deref(), + DEFAULT_CIRCUIT_COOL_DOWN, + ); + + InfluxDbSink { + id, + config, + client: None, + write_url: None, + messages_attempted: AtomicU64::new(0), + write_success: AtomicU64::new(0), + write_errors: AtomicU64::new(0), + verbose, + retry_delay, + payload_format, + circuit_breaker: Arc::new(CircuitBreaker::new(cb_threshold, cb_cool_down)), + } + } + + fn build_raw_client(&self) -> Result<reqwest::Client, Error> { + let timeout = parse_duration(self.config.timeout.as_deref(), DEFAULT_TIMEOUT); + reqwest::Client::builder() + .timeout(timeout) + .build() + .map_err(|e| Error::InitError(format!("Failed to create HTTP client: {e}"))) + } + + fn build_write_url(&self) -> Result<Url, Error> { + let base = self.config.url.trim_end_matches('/'); + let mut url = Url::parse(&format!("{base}/api/v2/write")) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}")))?; + + let precision = self + .config + .precision + .as_deref() + .unwrap_or(DEFAULT_PRECISION); + url.query_pairs_mut() + .append_pair("org", &self.config.org) + .append_pair("bucket", &self.config.bucket) + .append_pair("precision", precision); + + Ok(url) + } + + fn build_health_url(&self) -> Result<Url, Error> { + let base = self.config.url.trim_end_matches('/'); + Url::parse(&format!("{base}/health")) + .map_err(|e| Error::InvalidConfigValue(format!("Invalid InfluxDB URL: {e}"))) + } + + fn get_client(&self) -> Result<&ClientWithMiddleware, Error> { + self.client + .as_ref() + .ok_or_else(|| Error::Connection("InfluxDB client is not initialized".to_string())) + } + + fn measurement(&self) -> &str { + self.config + .measurement + .as_deref() + .unwrap_or("iggy_messages") + } + + fn payload_format(&self) -> PayloadFormat { + self.payload_format + } + + fn timestamp_precision(&self) -> &str { + self.config + .precision + .as_deref() + .unwrap_or(DEFAULT_PRECISION) + } + + fn get_max_retries(&self) -> u32 { + self.config + .max_retries + .unwrap_or(DEFAULT_MAX_RETRIES) + .max(1) + } + + fn to_precision_timestamp(&self, micros: u64) -> u64 { + match self.timestamp_precision() { + "ns" => micros.saturating_mul(1_000), + "us" => micros, + "ms" => micros / 1_000, + "s" => micros / 1_000_000, + _ => micros, + } + } + + /// Serialise one message as a line-protocol line, appending directly into + /// `buf` with no intermediate `Vec<String>` for tags or fields. + /// + /// # Allocation budget (per message, happy path) + /// - Zero `Vec` allocations for tags or fields. + /// - Zero per-tag/per-field `format!` allocations. 
+ /// - One `Vec<u8>` for `payload_bytes` (unavoidable — payload must be + /// decoded/serialised before it can be escaped into the buffer). + /// - The caller's `buf` grows in place; if it was pre-allocated with + /// `with_capacity` it will not reallocate for typical message sizes. + fn append_line( + &self, + buf: &mut String, + topic_metadata: &TopicMetadata, + messages_metadata: &MessagesMetadata, + message: &ConsumedMessage, + ) -> Result<(), Error> { + let include_metadata = self.config.include_metadata.unwrap_or(true); + let include_checksum = self.config.include_checksum.unwrap_or(true); + let include_origin_timestamp = self.config.include_origin_timestamp.unwrap_or(true); + let include_stream_tag = self.config.include_stream_tag.unwrap_or(true); + let include_topic_tag = self.config.include_topic_tag.unwrap_or(true); + let include_partition_tag = self.config.include_partition_tag.unwrap_or(true); + + // ── Measurement ────────────────────────────────────────────────────── + write_measurement(buf, self.measurement()); + + // ── Tag set ────────────────────────────────────────────────────────── + // Tags are written as ",key=value" pairs directly into buf. + // The offset tag is always present — it makes every point unique in + // InfluxDB's deduplication key (measurement + tag set + timestamp), + // regardless of precision or how many messages share a timestamp. + if include_metadata && include_stream_tag { + buf.push_str(",stream="); + write_tag_value(buf, &topic_metadata.stream); + } + if include_metadata && include_topic_tag { + buf.push_str(",topic="); + write_tag_value(buf, &topic_metadata.topic); + } + if include_metadata && include_partition_tag { + use std::fmt::Write as _; + write!(buf, ",partition={}", messages_metadata.partition_id) + .expect("write to String is infallible"); + } + // offset tag — always written, ensures point uniqueness + { + use std::fmt::Write as _; + write!(buf, ",offset={}", message.offset).expect("write to String is infallible"); + } + + // ── Field set ──────────────────────────────────────────────────────── + // First field: no leading comma. All subsequent fields: leading comma. + buf.push(' '); + + buf.push_str("message_id=\""); + write_field_string(buf, &message.id.to_string()); + buf.push('"'); + + // offset as a numeric field (queryable in Flux) in addition to the tag + { + use std::fmt::Write as _; + write!(buf, ",offset={}u", message.offset).expect("write to String is infallible"); + } + + // Optional metadata fields written when the corresponding tag is + // disabled (so the value is still queryable as a field). 
+ if include_metadata && !include_stream_tag { + buf.push_str(",iggy_stream=\""); + write_field_string(buf, &topic_metadata.stream); + buf.push('"'); + } + if include_metadata && !include_topic_tag { + buf.push_str(",iggy_topic=\""); + write_field_string(buf, &topic_metadata.topic); + buf.push('"'); + } + if include_metadata && !include_partition_tag { + use std::fmt::Write as _; + write!( + buf, + ",iggy_partition={}u", + messages_metadata.partition_id as u64 + ) + .expect("write to String is infallible"); + } + if include_checksum { + use std::fmt::Write as _; + write!(buf, ",iggy_checksum={}u", message.checksum) + .expect("write to String is infallible"); + } + if include_origin_timestamp { + use std::fmt::Write as _; + write!(buf, ",iggy_origin_timestamp={}u", message.origin_timestamp) + .expect("write to String is infallible"); + } + + // ── Payload field ──────────────────────────────────────────────────── + match self.payload_format() { + PayloadFormat::Json => { + // Fast path: if the payload is already a parsed simd_json value, + // serialise directly to a compact string — one pass, no bytes + // round-trip. Avoids: simd_json→bytes, bytes→serde_json::Value, + // serde_json::Value→string (three allocating passes per message). + // + // Fallback: any other Payload variant (Raw bytes that happen to + // contain JSON, Text, etc.) goes through try_to_bytes() first. + let compact = match &message.payload { + iggy_connector_sdk::Payload::Json(value) => simd_json::to_string(value) + .map_err(|e| { + Error::CannotStoreData(format!("Failed to serialize JSON payload: {e}")) + })?, + _ => { + let bytes = message.payload.try_to_bytes().map_err(|e| { + Error::CannotStoreData(format!( + "Failed to convert payload to bytes: {e}" + )) + })?; + // Validate that the bytes are actually JSON before + // writing them into the line-protocol field. + let value: serde_json::Value = + serde_json::from_slice(&bytes).map_err(|e| { + Error::CannotStoreData(format!( + "Payload format is json but payload is invalid JSON: {e}" + )) + })?; + serde_json::to_string(&value).map_err(|e| { + Error::CannotStoreData(format!("Failed to serialize JSON payload: {e}")) + })? + } + }; + buf.push_str(",payload_json=\""); + write_field_string(buf, &compact); + buf.push('"'); + } + PayloadFormat::Text => { + let payload_bytes = message.payload.try_to_bytes().map_err(|e| { + Error::CannotStoreData(format!("Failed to convert payload to bytes: {e}")) + })?; + let text = String::from_utf8(payload_bytes).map_err(|e| { + Error::CannotStoreData(format!( + "Payload format is text but payload is invalid UTF-8: {e}" + )) + })?; + buf.push_str(",payload_text=\""); + write_field_string(buf, &text); + buf.push('"'); + } + PayloadFormat::Base64 => { + let payload_bytes = message.payload.try_to_bytes().map_err(|e| { + Error::CannotStoreData(format!("Failed to convert payload to bytes: {e}")) + })?; + let encoded = general_purpose::STANDARD.encode(&payload_bytes); + buf.push_str(",payload_base64=\""); + write_field_string(buf, &encoded); + buf.push('"'); + } + } + + // ── Timestamp ──────────────────────────────────────────────────────── + // message.timestamp is microseconds since Unix epoch. + // Fall back to now() when unset (0) so points are not stored at the + // Unix epoch (year 1970), which falls outside every range(start:-1h). 
+ let base_micros = if message.timestamp == 0 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_micros() as u64 + } else { + message.timestamp + }; + let ts = self.to_precision_timestamp(base_micros); + + { + use std::fmt::Write as _; + write!(buf, " {ts}").expect("write to String is infallible"); + } + + debug!( + "InfluxDB sink ID: {} point — offset={}, raw_ts={}, influx_ts={ts}", + self.id, message.offset, message.timestamp + ); + + Ok(()) + } + + async fn process_batch( + &self, + topic_metadata: &TopicMetadata, + messages_metadata: &MessagesMetadata, + messages: &[ConsumedMessage], + ) -> Result<(), Error> { + if messages.is_empty() { + return Ok(()); + } + + // Single buffer for the entire batch — reused across all messages. + // Pre-allocate a generous estimate (256 bytes per message) to avoid + // reallocation in the common case. The buffer is passed into + // append_line() which writes each line directly, with '\n' separators + // between lines. No per-message String is allocated. + let mut body = String::with_capacity(messages.len() * 256); + + for (i, message) in messages.iter().enumerate() { + if i > 0 { + body.push('\n'); + } + self.append_line(&mut body, topic_metadata, messages_metadata, message)?; + } + + let client = self.get_client()?; + let url = self.write_url.clone().ok_or_else(|| { + Error::Connection("write_url not initialised — was open() called?".to_string()) + })?; + let token = self.config.token.expose_secret().to_owned(); + + // Convert once before sending — Bytes is reference-counted so any + // retry inside the middleware clones the pointer, not the payload data. + let body: Bytes = Bytes::from(body); + + let response = client + .post(url) + .header("Authorization", format!("Token {token}")) + .header("Content-Type", "text/plain; charset=utf-8") + .body(body) + .send() + .await + .map_err(|e| Error::CannotStoreData(format!("InfluxDB write failed: {e}")))?; + + let status = response.status(); + if status.is_success() { + return Ok(()); + } + + let body_text = response + .text() + .await + .unwrap_or_else(|_| "failed to read response body".to_string()); + + // Use PermanentHttpError for non-transient 4xx (400 Bad Request, 422 + // schema conflict, etc.) so consume() can skip the circuit breaker for + // these — they indicate a data/schema issue, not an infrastructure one. + if iggy_connector_sdk::retry::is_transient_status(status) { + Err(Error::CannotStoreData(format!( + "InfluxDB write failed with status {status}: {body_text}" + ))) + } else { + Err(Error::PermanentHttpError(format!( + "InfluxDB write failed with status {status}: {body_text}" + ))) + } + } +} + +// --------------------------------------------------------------------------- +// Sink trait implementation +// --------------------------------------------------------------------------- + +#[async_trait] +impl Sink for InfluxDbSink { + async fn open(&mut self) -> Result<(), Error> { + info!( + "Opening InfluxDB sink connector with ID: {}. Bucket: {}, org: {}", + self.id, self.config.bucket, self.config.org + ); + + // Build the raw client first and use it for the startup connectivity + // check. The connectivity retry loop uses separate delay bounds + // (open_retry_max_delay) from the per-write middleware retries, so + // we keep them independent rather than routing health checks through + // the write-tuned middleware. 
+ let raw_client = self.build_raw_client()?; + let health_url = self.build_health_url()?; + check_connectivity_with_retry( + &raw_client, + health_url, + "InfluxDB sink", + self.id, + &ConnectivityConfig { + max_open_retries: self + .config + .max_open_retries + .unwrap_or(DEFAULT_MAX_OPEN_RETRIES), + open_retry_max_delay: parse_duration( + self.config.open_retry_max_delay.as_deref(), + DEFAULT_OPEN_RETRY_MAX_DELAY, + ), + retry_delay: self.retry_delay, + }, + ) + .await?; + + // Wrap in the retry middleware for all subsequent write operations. + // The middleware handles transient 429 / 5xx retries with + // exponential back-off, jitter, and Retry-After header support. + let max_retries = self.get_max_retries(); + let write_retry_max_delay = parse_duration( + self.config.retry_max_delay.as_deref(), + DEFAULT_RETRY_MAX_DELAY, + ); + self.client = Some(build_retry_client( + raw_client, + max_retries, + self.retry_delay, + write_retry_max_delay, + "InfluxDB", + )); + + // Cache once — both are derived purely from config fields that + // never change at runtime. + self.write_url = Some(self.build_write_url()?); + + info!( + "InfluxDB sink connector with ID: {} opened successfully", + self.id + ); + Ok(()) + } + + async fn consume( + &self, + topic_metadata: &TopicMetadata, + messages_metadata: MessagesMetadata, + messages: Vec<ConsumedMessage>, + ) -> Result<(), Error> { + let batch_size = self.config.batch_size.unwrap_or(500) as usize; + let total_messages = messages.len(); + + // Skip writes entirely if circuit breaker is open + if self.circuit_breaker.is_open().await { + warn!( + "InfluxDB sink ID: {} — circuit breaker is OPEN. \ + Skipping {} messages to avoid hammering a down InfluxDB.", + self.id, total_messages + ); + // Return an error so the runtime knows messages were not written + return Err(Error::CannotStoreData( + "Circuit breaker is open — InfluxDB write skipped".to_string(), + )); + } + + // Collect the first batch error rather than silently dropping + let mut first_error: Option<Error> = None; + + for batch in messages.chunks(batch_size.max(1)) { + match self + .process_batch(topic_metadata, &messages_metadata, batch) + .await + { + Ok(()) => { + // Successful write — reset circuit breaker + self.circuit_breaker.record_success();
Review Comment: within a single `consume()` call, if batch 1 fails but batch 2 succeeds, `record_success()` here resets the circuit breaker's failure counter. The circuit therefore never trips as long as at least one batch per call succeeds, and repeated failures on the same data are masked. Fix: move `record_success()` after the loop, guarded by `first_error.is_none()`.
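A sketch of the suggested restructuring; the `Err` arm's bookkeeping is elided in the quoted hunk, so its body here is only a placeholder for whatever the PR already does:

```rust
let mut first_error: Option<Error> = None;

for batch in messages.chunks(batch_size.max(1)) {
    match self
        .process_batch(topic_metadata, &messages_metadata, batch)
        .await
    {
        // No breaker reset inside the loop: a partial success must not
        // erase the failure history of earlier batches in this call.
        Ok(()) => {}
        Err(e) => {
            // ... failure bookkeeping as in the PR (elided from the hunk)
            if first_error.is_none() {
                first_error = Some(e);
            }
        }
    }
}

// Only a fully clean consume() counts as success for the breaker.
if first_error.is_none() {
    self.circuit_breaker.record_success();
}
```

########## core/connectors/sources/influxdb_source/src/lib.rs: ########## @@ -0,0 +1,1241 @@ +/* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.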
+ */ + +use async_trait::async_trait; +use base64::{Engine as _, engine::general_purpose}; +use csv::StringRecord; +use iggy_common::serde_secret::serialize_secret; +use iggy_common::{DateTime, Utc}; +use iggy_connector_sdk::retry::{ + CircuitBreaker, ConnectivityConfig, build_retry_client, check_connectivity_with_retry, + parse_duration, +}; +use iggy_connector_sdk::{ + ConnectorState, Error, ProducedMessage, ProducedMessages, Schema, Source, source_connector, +}; +use regex::Regex; +use reqwest::Url; +use reqwest_middleware::ClientWithMiddleware; +use secrecy::{ExposeSecret, SecretString}; +use serde::{Deserialize, Serialize}; +use serde_json::json; +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::OnceLock; +use std::time::Duration; +use tokio::sync::Mutex; +use tracing::{debug, error, info, warn}; +use uuid::Uuid; + +source_connector!(InfluxDbSource); + +const CONNECTOR_NAME: &str = "InfluxDB source"; +const DEFAULT_MAX_RETRIES: u32 = 3; +const DEFAULT_RETRY_DELAY: &str = "1s"; +const DEFAULT_POLL_INTERVAL: &str = "5s"; +const DEFAULT_TIMEOUT: &str = "10s"; +const DEFAULT_CURSOR: &str = "1970-01-01T00:00:00Z"; +// Maximum attempts for open() connectivity retries +const DEFAULT_MAX_OPEN_RETRIES: u32 = 10; +// Cap for exponential backoff in open() — never wait longer than this +const DEFAULT_OPEN_RETRY_MAX_DELAY: &str = "60s"; +// Cap for exponential backoff on per-query retries — kept short so a +// transient InfluxDB blip does not stall polling for too long +const DEFAULT_RETRY_MAX_DELAY: &str = "5s"; +// How many consecutive poll failures open the circuit breaker +const DEFAULT_CIRCUIT_BREAKER_THRESHOLD: u32 = 5; +// How long the circuit stays open before allowing a probe attempt +const DEFAULT_CIRCUIT_COOL_DOWN: &str = "30s"; + +/// RFC 3339 / ISO 8601 datetime pattern. +/// Matches the forms InfluxDB stores in `_time`: +/// "2024-01-15T10:30:00Z" +/// "2024-01-15T10:30:00.123456789Z" +/// "2024-01-15T10:30:00+05:30" +/// Intentionally strict: only digits, T, Z, colon, dot, plus, hyphen. +/// Any Flux syntax character (pipe, quote, paren, space, slash) is rejected. +static CURSOR_RE: OnceLock<Regex> = OnceLock::new(); + +// --------------------------------------------------------------------------- +// Main connector structs +// --------------------------------------------------------------------------- + +#[derive(Debug)] +pub struct InfluxDbSource { + pub id: u32, + config: InfluxDbSourceConfig, + /// `None` until `open()` is called. Wraps `reqwest::Client` with + /// [`HttpRetryMiddleware`] so retry/back-off/jitter is handled + /// transparently by the middleware stack instead of a hand-rolled loop. + client: Option<ClientWithMiddleware>, + state: Mutex<State>, + verbose: bool, + retry_delay: Duration, + poll_interval: Duration, + /// Resolved once in `new()` — avoids a `to_ascii_lowercase()` allocation + /// on every message in the hot path. 
+ payload_format: PayloadFormat, + circuit_breaker: Arc<CircuitBreaker>, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InfluxDbSourceConfig { + pub url: String, + pub org: String, + #[serde(serialize_with = "serialize_secret")] + pub token: SecretString, + pub query: String, + pub poll_interval: Option<String>, + pub batch_size: Option<u32>, + pub cursor_field: Option<String>, + pub initial_offset: Option<String>, + pub payload_column: Option<String>, + pub payload_format: Option<String>, + pub include_metadata: Option<bool>, + pub verbose_logging: Option<bool>, + pub max_retries: Option<u32>, + pub retry_delay: Option<String>, + pub timeout: Option<String>, + // How many times open() will retry before giving up + pub max_open_retries: Option<u32>, + // Upper cap on open() backoff delay — can be set high (e.g. "60s") for + // patient startup without affecting per-query retry behaviour + pub open_retry_max_delay: Option<String>, + // Upper cap on per-query retry backoff — kept short so a transient blip + // does not stall polling; independent of open_retry_max_delay + pub retry_max_delay: Option<String>, + // Circuit breaker configuration + pub circuit_breaker_threshold: Option<u32>, + pub circuit_breaker_cool_down: Option<String>, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +enum PayloadFormat { + #[default] + Json, + Text, + Raw, +} + +impl PayloadFormat { + fn from_config(value: Option<&str>) -> Self { + match value.map(|v| v.to_ascii_lowercase()).as_deref() { + Some("text") | Some("utf8") => PayloadFormat::Text, + Some("raw") | Some("base64") => PayloadFormat::Raw, + Some("json") => PayloadFormat::Json, + other => { + warn!( + "Unrecognized payload_format value {:?}, falling back to JSON. \ + Valid values are: \"json\", \"text\", \"utf8\", \"base64\", \"raw\".", + other + ); + PayloadFormat::Json + } + } + } + + fn schema(self) -> Schema { + match self { + PayloadFormat::Json => Schema::Json, + PayloadFormat::Text => Schema::Text, + PayloadFormat::Raw => Schema::Raw, + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +struct State { + last_poll_time: DateTime<Utc>, + last_timestamp: Option<String>, + processed_rows: u64, + /// How many rows at `last_timestamp` have already been delivered downstream. + /// + /// When the user's Flux query uses `>= $cursor`, consecutive polls may + /// return the same rows for the current cursor timestamp. This counter + /// lets `poll_messages` skip those already-delivered rows and inflate + /// `$limit` accordingly, preventing both duplicates and data loss at + /// batch boundaries where multiple rows share the same timestamp. + /// + /// `#[serde(default)]` keeps existing persisted state files forward-compatible: + /// the field defaults to 0 when the state was saved by an older version. 
+ #[serde(default)] + cursor_row_count: u64, +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +fn parse_scalar(value: &str) -> serde_json::Value { + if value.is_empty() { + return serde_json::Value::Null; + } + if let Ok(v) = value.parse::<bool>() { + return serde_json::Value::Bool(v); + } + if let Ok(v) = value.parse::<i64>() { + return serde_json::Value::Number(v.into()); + } + if let Ok(v) = value.parse::<f64>() + && let Some(number) = serde_json::Number::from_f64(v) + { + return serde_json::Value::Number(number); + } + serde_json::Value::String(value.to_string()) +} + +/// Recognise an InfluxDB CSV header row. +/// +/// A header row must contain a `_time` column. The `_value` column is +/// intentionally **not** required: Flux aggregation queries (`count()`, +/// `mean()`, `group()`) produce result tables with columns like `_count` or +/// `_mean` instead of `_value`. Requiring `_value` would cause those header +/// rows to be missed, silently skipping all subsequent data rows until the +/// next recognised header. +/// +/// InfluxDB annotation rows (`#group`, `#datatype`, `#default`) are already +/// filtered out earlier in `parse_csv_rows` by the leading-`#` check, so +/// they will never reach this function. +fn is_header_record(record: &StringRecord) -> bool { + record.iter().any(|v| v == "_time") +} + +/// Compare two RFC 3339 timestamp strings chronologically. +/// +/// InfluxDB strips trailing fractional-second zeros, producing timestamps like +/// `"2026-03-18T12:00:00.60952Z"` (= 609520µs). A naïve `>` string comparison +/// treats this as *greater* than `"2026-03-18T12:00:00.609521Z"` because `'Z'` +/// (ASCII 90) > `'1'` (ASCII 49), even though the former is chronologically +/// *earlier*. Always parse to `DateTime<Utc>` so the comparison is correct. +fn is_timestamp_after(a: &str, b: &str) -> bool { + match (a.parse::<DateTime<Utc>>(), b.parse::<DateTime<Utc>>()) { + (Ok(dt_a), Ok(dt_b)) => dt_a > dt_b, + _ => a > b, + } +} + +// --------------------------------------------------------------------------- +// InfluxDbSource implementation +// --------------------------------------------------------------------------- + +impl InfluxDbSource { + pub fn new(id: u32, config: InfluxDbSourceConfig, state: Option<ConnectorState>) -> Self { + let verbose = config.verbose_logging.unwrap_or(false); + let retry_delay = parse_duration(config.retry_delay.as_deref(), DEFAULT_RETRY_DELAY); + let poll_interval = parse_duration(config.poll_interval.as_deref(), DEFAULT_POLL_INTERVAL); + let payload_format = PayloadFormat::from_config(config.payload_format.as_deref()); + + // Build circuit breaker from config + let cb_threshold = config + .circuit_breaker_threshold + .unwrap_or(DEFAULT_CIRCUIT_BREAKER_THRESHOLD); + let cb_cool_down = parse_duration( + config.circuit_breaker_cool_down.as_deref(), + DEFAULT_CIRCUIT_COOL_DOWN, + ); + + let restored_state = state
Review Comment: if the `State` struct gains or removes a field in a future version, `deserialize()` returns `None` and the connector silently starts polling from `1970-01-01T00:00:00Z`: a full re-delivery with no alert. It should at minimum log at error level with the connector ID, or fail `open()`, so the operator knows the cursor was reset.
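A minimal sketch of the logging option, reusing `error!` and `DEFAULT_CURSOR` which the file already imports/defines (the success-path `inspect` log from the PR is elided for brevity); the point is distinguishing "no prior state" from "state present but undecodable":

```rust
let restored_state = match state {
    // First run: no persisted state, nothing suspicious.
    None => None,
    Some(s) => match s.deserialize::<State>(CONNECTOR_NAME, id) {
        Some(restored) => Some(restored),
        None => {
            // Persisted state exists but no longer matches the State
            // schema. The cursor is about to fall back to DEFAULT_CURSOR
            // and re-deliver everything, so make the reset loud.
            error!(
                "Failed to deserialize persisted state for {CONNECTOR_NAME} \
                 connector with ID: {id}; cursor resets to {DEFAULT_CURSOR} \
                 and all historical rows will be re-delivered"
            );
            None
        }
    },
};
```

-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.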
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
