nickva commented on code in PR #5983: URL: https://github.com/apache/couchdb/pull/5983#discussion_r3223640611
########## src/couch_replicator/src/couch_replicator_connect.erl: ########## @@ -0,0 +1,260 @@ +% Licensed under the Apache License, Version 2.0 (the "License"); you may not +% use this file except in compliance with the License. You may obtain a copy of +% the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +% License for the specific language governing permissions and limitations under +% the License. + +%% This module implements the connect_to configuration option, which allows +%% routing replication requests through proxies or rewriting connection ports. +%% Similar to curl's --connect-to option. +%% +%% Flow: +%% 1. init/0 - Called at startup to parse and cache connect_to config +%% 2. apply_connect_to/2 - Called for each replication request: +%% a. Parse URL to extract host and port +%% b. resolve_connection/2 - Check if host:port matches any override pattern +%% c. If match found: +%% - Reconstruct URL with target port +%% - Add ibrowse connect_to option (target host) +%% - Add SNI option for HTTPS (original host) +%% d. Return modified URL and options +%% +%% Configuration format: host:port:target_host:target_port +%% Example: *.example.com:443:proxy.internal:8443 +%% +%% Pattern matching: +%% - Exact hostnames: foo.example.com +%% - Leading wildcards: *.example.com (matches sub.example.com, not example.com) +%% - Case-insensitive +%% - Port must match exactly + +-module(couch_replicator_connect). + +-include_lib("ibrowse/include/ibrowse.hrl"). + +-export([ + init/0, + apply_connect_to/2 +]). + +-ifdef(TEST). +-export([ + parse_config/1, + match_host_pattern/2, + get_overrides/0, + resolve_connection/2, + is_ip_address/1 +]). +-endif. 
+ +-type connect_to_override() :: { + PatternHost :: binary(), + PatternPort :: integer(), + TargetHost :: binary(), + TargetPort :: integer() +}. + +-define(CONNECT_TO_KEY, {?MODULE, connect_to}). + +%% Initialize connect_to overrides cache +-spec init() -> ok. +init() -> + Overrides = + case config:get("replicator", "connect_to", undefined) of + undefined -> []; + ConfigStr -> parse_config(ConfigStr) + end, + persistent_term:put(?CONNECT_TO_KEY, Overrides), + ok. + +%% Resolve connection override for a host:port pair. +%% String/binary conversions are necessary because: +%% - Input: ibrowse provides Host as string +%% - Internal: overrides stored as binaries for efficient pattern matching +%% - Output: ibrowse connect_to option requires string +-spec resolve_connection(string(), integer()) -> + {string(), integer(), string()} | not_found. +resolve_connection(Host, Port) -> + case find_override(list_to_binary(Host), Port, get_overrides()) of + {ok, {TargetHost, TargetPort}} -> + {binary_to_list(TargetHost), TargetPort, Host}; + not_found -> + not_found + end. + +-spec get_overrides() -> [connect_to_override()]. +get_overrides() -> + case persistent_term:get(?CONNECT_TO_KEY, not_initialized) of + not_initialized -> + % not initialized yet, fall back to reading config + case config:get("replicator", "connect_to", undefined) of + undefined -> []; + ConfigStr -> parse_config(ConfigStr) + end; + Overrides -> + Overrides + end. + +-spec parse_config(string()) -> [connect_to_override()]. +parse_config(ConfigStr) -> + ConfigBin = list_to_binary(ConfigStr), + Entries = binary:split(ConfigBin, <<",">>, [global, trim]), + lists:filtermap(fun parse_entry/1, Entries). 
+ +% Format: HOST:PORT:TARGET:TARGET_PORT (matches curl --connect-to) +% Examples: +% *.example.com:443:192.168.1.1:8443 +% *.example.com:443:[2001:db8::1]:8443 +% IPv6 addresses in targets must be enclosed in brackets +parse_entry(<<>>) -> + false; +parse_entry(Entry0) -> + Entry = string:trim(Entry0), + % Regex: HOST:PORT:TARGET:TARGET_PORT where TARGET can be [IPv6] + % Reject IPv6 patterns (starting with [), ensure non-empty captures + Pattern = "^([^:\\[]+):([0-9]+):([^:]+|\\[[^\\]]+\\]):([0-9]+)$", Review Comment: It's a bit of a gnarly expression, but I couldn't come up with anything smaller; after staring at it for a while, it does seem right. ########## src/couch_replicator/src/couch_replicator_connect.erl: ########## @@ -0,0 +1,260 @@ +% Licensed under the Apache License, Version 2.0 (the "License"); you may not +% use this file except in compliance with the License. You may obtain a copy of +% the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +% License for the specific language governing permissions and limitations under +% the License. + +%% This module implements the connect_to configuration option, which allows +%% routing replication requests through proxies or rewriting connection ports. +%% Similar to curl's --connect-to option. +%% +%% Flow: +%% 1. init/0 - Called at startup to parse and cache connect_to config +%% 2. apply_connect_to/2 - Called for each replication request: +%% a. Parse URL to extract host and port +%% b. resolve_connection/2 - Check if host:port matches any override pattern +%% c. If match found: +%% - Reconstruct URL with target port +%% - Add ibrowse connect_to option (target host) +%% - Add SNI option for HTTPS (original host) +%% d. 
Return modified URL and options +%% +%% Configuration format: host:port:target_host:target_port +%% Example: *.example.com:443:proxy.internal:8443 +%% +%% Pattern matching: +%% - Exact hostnames: foo.example.com +%% - Leading wildcards: *.example.com (matches sub.example.com, not example.com) +%% - Case-insensitive +%% - Port must match exactly + +-module(couch_replicator_connect). + +-include_lib("ibrowse/include/ibrowse.hrl"). + +-export([ + init/0, + apply_connect_to/2 +]). + +-ifdef(TEST). +-export([ + parse_config/1, + match_host_pattern/2, + get_overrides/0, + resolve_connection/2, + is_ip_address/1 +]). +-endif. + +-type connect_to_override() :: { + PatternHost :: binary(), + PatternPort :: integer(), + TargetHost :: binary(), + TargetPort :: integer() +}. + +-define(CONNECT_TO_KEY, {?MODULE, connect_to}). + +%% Initialize connect_to overrides cache +-spec init() -> ok. +init() -> + Overrides = + case config:get("replicator", "connect_to", undefined) of + undefined -> []; + ConfigStr -> parse_config(ConfigStr) + end, + persistent_term:put(?CONNECT_TO_KEY, Overrides), + ok. + +%% Resolve connection override for a host:port pair. +%% String/binary conversions are necessary because: +%% - Input: ibrowse provides Host as string +%% - Internal: overrides stored as binaries for efficient pattern matching +%% - Output: ibrowse connect_to option requires string +-spec resolve_connection(string(), integer()) -> + {string(), integer(), string()} | not_found. +resolve_connection(Host, Port) -> + case find_override(list_to_binary(Host), Port, get_overrides()) of + {ok, {TargetHost, TargetPort}} -> + {binary_to_list(TargetHost), TargetPort, Host}; + not_found -> + not_found + end. + +-spec get_overrides() -> [connect_to_override()]. 
+get_overrides() -> + case persistent_term:get(?CONNECT_TO_KEY, not_initialized) of + not_initialized -> + % not initialized yet, fall back to reading config + case config:get("replicator", "connect_to", undefined) of + undefined -> []; + ConfigStr -> parse_config(ConfigStr) + end; + Overrides -> + Overrides + end. + +-spec parse_config(string()) -> [connect_to_override()]. +parse_config(ConfigStr) -> + ConfigBin = list_to_binary(ConfigStr), + Entries = binary:split(ConfigBin, <<",">>, [global, trim]), + lists:filtermap(fun parse_entry/1, Entries). + +% Format: HOST:PORT:TARGET:TARGET_PORT (matches curl --connect-to) +% Examples: +% *.example.com:443:192.168.1.1:8443 +% *.example.com:443:[2001:db8::1]:8443 +% IPv6 addresses in targets must be enclosed in brackets +parse_entry(<<>>) -> + false; +parse_entry(Entry0) -> + Entry = string:trim(Entry0), + % Regex: HOST:PORT:TARGET:TARGET_PORT where TARGET can be [IPv6] + % Reject IPv6 patterns (starting with [), ensure non-empty captures + Pattern = "^([^:\\[]+):([0-9]+):([^:]+|\\[[^\\]]+\\]):([0-9]+)$", + case re:run(Entry, Pattern, [{capture, all_but_first, binary}]) of + {match, [PatternHost, PatternPortBin, TargetHost0, TargetPortBin]} -> + % Regex guarantees non-empty hosts and numeric ports + PatternPort = binary_to_integer(PatternPortBin), + TargetPort = binary_to_integer(TargetPortBin), + % Strip brackets from IPv6 addresses in targets + TargetHost = string:trim(TargetHost0, both, "[]"), Review Comment: Even if we strip the `[]` brackets from an IPv6 address, when we pass an IP like "::1" to `gen_tcp|ssl:connect/3` I think it will still fail; it can only handle a tuple: ``` ([email protected])15> ssl:connect("[2607:f8b0:4023:100d::8b]", 443, [{verify, verify_none}]). {error,nxdomain} ([email protected])16> ssl:connect("2607:f8b0:4023:100d::8b", 443, [{verify, verify_none}]). {error,nxdomain} > ssl:connect({9735,63664,16419,4109,0,0,0,139}, 443, [{verify, verify_none}]). 
{ok,{sslsocket,{gen_tcp,#Port<0.28>,tls_connection, undefined}, [<0.57527.0>,<0.57526.0>]}} ``` (address is from `dig AAAA google.com`) I wonder if we could then take the `TargetHost` and parse it with a helper such as: ``` parse_target(Bin) -> case inet:parse_strict_address(binary_to_list(Bin)) of {ok, Tuple} -> {ok, Tuple}; _ -> {ok, Bin} end ``` Then `connect_to` can get either a string or a tuple. ########## src/couch_replicator/src/couch_replicator_connect.erl: ########## @@ -0,0 +1,260 @@ +% Licensed under the Apache License, Version 2.0 (the "License"); you may not +% use this file except in compliance with the License. You may obtain a copy of +% the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +% License for the specific language governing permissions and limitations under +% the License. + +%% This module implements the connect_to configuration option, which allows +%% routing replication requests through proxies or rewriting connection ports. +%% Similar to curl's --connect-to option. +%% +%% Flow: +%% 1. init/0 - Called at startup to parse and cache connect_to config +%% 2. apply_connect_to/2 - Called for each replication request: +%% a. Parse URL to extract host and port +%% b. resolve_connection/2 - Check if host:port matches any override pattern +%% c. If match found: +%% - Reconstruct URL with target port +%% - Add ibrowse connect_to option (target host) +%% - Add SNI option for HTTPS (original host) +%% d. 
Return modified URL and options +%% +%% Configuration format: host:port:target_host:target_port +%% Example: *.example.com:443:proxy.internal:8443 +%% +%% Pattern matching: +%% - Exact hostnames: foo.example.com +%% - Leading wildcards: *.example.com (matches sub.example.com, not example.com) +%% - Case-insensitive +%% - Port must match exactly + +-module(couch_replicator_connect). + +-include_lib("ibrowse/include/ibrowse.hrl"). + +-export([ + init/0, + apply_connect_to/2 +]). + +-ifdef(TEST). +-export([ + parse_config/1, + match_host_pattern/2, + get_overrides/0, + resolve_connection/2, + is_ip_address/1 +]). +-endif. + +-type connect_to_override() :: { + PatternHost :: binary(), + PatternPort :: integer(), + TargetHost :: binary(), + TargetPort :: integer() +}. + +-define(CONNECT_TO_KEY, {?MODULE, connect_to}). + +%% Initialize connect_to overrides cache +-spec init() -> ok. +init() -> + Overrides = + case config:get("replicator", "connect_to", undefined) of + undefined -> []; + ConfigStr -> parse_config(ConfigStr) + end, + persistent_term:put(?CONNECT_TO_KEY, Overrides), + ok. + +%% Resolve connection override for a host:port pair. +%% String/binary conversions are necessary because: +%% - Input: ibrowse provides Host as string +%% - Internal: overrides stored as binaries for efficient pattern matching +%% - Output: ibrowse connect_to option requires string +-spec resolve_connection(string(), integer()) -> + {string(), integer(), string()} | not_found. +resolve_connection(Host, Port) -> + case find_override(list_to_binary(Host), Port, get_overrides()) of + {ok, {TargetHost, TargetPort}} -> + {binary_to_list(TargetHost), TargetPort, Host}; + not_found -> + not_found + end. + +-spec get_overrides() -> [connect_to_override()]. 
+get_overrides() -> + case persistent_term:get(?CONNECT_TO_KEY, not_initialized) of + not_initialized -> + % not initialized yet, fall back to reading config + case config:get("replicator", "connect_to", undefined) of + undefined -> []; + ConfigStr -> parse_config(ConfigStr) + end; + Overrides -> + Overrides + end. + +-spec parse_config(string()) -> [connect_to_override()]. +parse_config(ConfigStr) -> + ConfigBin = list_to_binary(ConfigStr), + Entries = binary:split(ConfigBin, <<",">>, [global, trim]), + lists:filtermap(fun parse_entry/1, Entries). + +% Format: HOST:PORT:TARGET:TARGET_PORT (matches curl --connect-to) +% Examples: +% *.example.com:443:192.168.1.1:8443 +% *.example.com:443:[2001:db8::1]:8443 +% IPv6 addresses in targets must be enclosed in brackets +parse_entry(<<>>) -> + false; +parse_entry(Entry0) -> + Entry = string:trim(Entry0), + % Regex: HOST:PORT:TARGET:TARGET_PORT where TARGET can be [IPv6] + % Reject IPv6 patterns (starting with [), ensure non-empty captures + Pattern = "^([^:\\[]+):([0-9]+):([^:]+|\\[[^\\]]+\\]):([0-9]+)$", + case re:run(Entry, Pattern, [{capture, all_but_first, binary}]) of Review Comment: `all_but_first` Hadn't seen that used before, that's neat: > All but the first matching subpattern, that is, all explicitly captured subpatterns, but not the complete matching part of the subject string. This is useful if the regular expression as a whole matches a large part of the subject, but the part you are interested in is in an explicitly captured subpattern. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
