I've solved my problem with captures, but I don't understand how to get
positions of matches:
my $regex = qr
{
(?i) # Case-insensitive
(
[\x{2022}\*]* # Any number of bullet or asterisk characters
[1-9]+ # One or more digits 1-9
\s* # Any number of spaces
(?:
\-|\,|through|and # Zero or one dash, comma or "through" or "and"
)*?
\s* # Any number of spaces
)+
}msx;
my $regex1 = qr
{
(?i) # Case-insensitive
(
(?:
figure | fig[s]?[\.]?? | table | box | chapter | diagram | scheme |
chart | plate | appendix | part | section | footnote | [p]{1,2}\.?? | page
)
\s* # Any number of spaces
(?:
[0-9]+ # One or more digits 1-9
\s* # Any number of spaces
(?:
\-|\,|through|and|\s # Zero or one dash, comma or "through" or "and"
)*
\s* # Any number of spaces
)+
)
}msx;
my @vancouverCites =
(
"[4 5, Figure 3; 12 Chapter 4-5]",
"[8, Chapter 10]",
"[9 through 15, pp. 35-46]",
"[11, pp. 35 Through 46]",
"[see 1, 4]",
"[e.g. 2, 5]",
"[e.g. •2, ••5]",
"[e.g. *2, **5]",
"[for example 1,17]",
"[2, 9]",
);
foreach my $c (@vancouverCites)
{
$c =~ s/$regex1//g;
print "Text=\"$c\" ";
my @matches = $c =~ /$regex/g;
foreach my $arr (@matches)
{
print "Array = $arr " if defined $arr;
}
print " pos=$-[0] - $+[0]", "\n";
}
Output:
Text="[4 5, ; 12 ]" Array = 5 Array = 12 pos=12 - 11
Text="[8, ]" Array = 8 pos=5 - 2
Text="[9 through 15, ]" Array = 9 Array = 15 pos=16 - 13
Text="[11, ]" Array = 11 pos=6 - 3
Text="[see 1, 4]" Array = 1 Array = 4 pos=10 - 9
Text="[e.g. 2, 5]" Array = 2 Array = 5 pos=11 - 10
Text="[e.g. •2, ••5]" Array = •2 Array = ••5 pos=14 - 13
Text="[e.g. *2, **5]" Array = *2 Array = **5 pos=14 - 13
Text="[for example 1,17]" Array = 1 Array = 17 pos=18 - 17
Text="[2, 9]" Array = 2 Array = 9 pos=6 - 5
Why $-[0] and $+[0] are *so strange*?