6.23. Regular Expression Grab Bag
We have found these regular expressions
useful or interesting:
- Swap first two words
-
s/(\S+)(\s+)(\S+)/$3$2$1/
- Keyword = Value
-
m/^(\w+)\s*=\s*(.*?)\s*$/ # keyword is $1, value is $2
- Line of at least 80 characters
-
m/.{80,}/
length( ) >= 80 # ok, not a regex
- MM/DD/YY HH:MM:SS
-
m|(\d+)/(\d+)/(\d+) (\d+):(\d+):(\d+)|
- Changing directories
-
s(/usr/bin)(/usr/local/bin)g
- Expanding %7E (hex) escapes
-
s/%([0-9A-Fa-f][0-9A-Fa-f])/chr(hex($1))/ge
- Deleting C comments (imperfectly)
-
s{
/\* # Match the opening delimiter
.*? # Match a minimal number of characters
\*/ # Match the closing delimiter
}{ }gsx;
- Removing leading and trailing whitespace
-
s/^\s+//;
s/\s+$//;
- Turning \ followed by n into a real newline
-
s/\\n/\n/g;
- Removing package portion of fully qualified symbols
-
s/^.*:://
- Dotted quads (most IP addresses)
-
# XXX: fails on legal IPs 127.1 and 2130706433.
m{
^ ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
\. ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
\. ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
\. ( \d | [01]?\d\d | 2[0-4]\d | 25[0-5] )
$
}x
- Removing leading path from filename
-
s{^.*/}{}
- Extracting columns setting from TERMCAP
-
$cols = ( ($ENV{TERMCAP} || " ") =~ m/:co#(\d+):/ ) ? $1 : 80;
- Removing directory components from program name and arguments
-
($name = " $0 @ARGV") =~ s{ /\S+/}{ }g;
- Checking your operating system
-
die "This isn't Linux" unless $^O =~ m/linux/i;
- Joining continuation lines in multiline string
-
s/\n\s+/ /g
- Extracting all numbers from a string
-
@nums = m/(\d+\.?\d*|\.\d+)/g;
- Finding all-caps words
-
@capwords = m/(\b\p{ Upper-case Letter }+\b)/g;
- Finding all-lowercase words
-
@lowords = m/(\b\p{ Lower-case Letter }+\b)/g;
- Finding initial-caps word
-
@icwords = m{
( \b
[\p{ Upper-case Letter }\p{ Title-case Letter }]
\p{ Lower-case Letter } *
\b )
}gx;
- Finding links in simple HTML
-
@links = m/<A[^>]+?HREF\s*=\s*["']?([^'" >]+?)['"]?\s*>/ig;
- Finding middle initial in $_
-
$initial = /^\S+\s+(\S)\S*\s+\S/ ? $1 : "";
- Changing double vertical prime pairs to curly quotes
-
s/"([^"]*)"/``$1''/g # old way
# next is unicode only
s/"([^"]*)"/\x{201C}$1\x{201D}/g
- Extracting sentences (double spaces required between each)
-
{ local $/ = "";
while (<>) {
s/\n/ /g;
s/ {3,}/  /g;
push @sentences, m/(\S.*?[!?.])(?= {2}|\Z)/g;
}
}
- YYYY-MM-DD
-
m/\b(\d{4})-(\d\d)-(\d\d)\b/ # YYYY in $1, MM in $2, DD in $3
- North American telephone numbers
-
m/ ^
(?:
1 \s (?: \d\d\d \s)? # 1, or 1 and area code
| # ... or ...
\(\d\d\d\) \s # area code with parens
| # ... or ...
(?: \+\d\d?\d? \s)? # optional +country code
\d\d\d ([\s\-]) # and area code
)
\d\d\d (\s|\1) # prefix (and area code separator)
\d\d\d\d # exchange
$
/x
- Exclamations
-
m/\boh\s+my\s+gh?o(d(dess(es)?|s?)|odness|sh)\b/i
- Extracting lines regardless of line terminator
-
push(@lines, $1) while $input =~ s{
^ # gobble from front
(
. # begin $1: any single char (/s)
*? # but minimally matching even none
)
(?: # make capturing if saving terminators
\x0D \x0A # CRLF
| \x0A # LF
| \x0D # CR
| \x0C # FF
# (see http://www.unicode.org/reports/tr13/tr13-9.html)
| \x{2028} # Unicode LS
| \x{2029} # Unicode PS
)
}{}sx; # consumes $input
Or use split:
@lines = split m{
(?: # make capturing if saving terminators
\x0D \x0A # CRLF
| \x0A # LF
| \x0D # CR
| \x0C # FF
# (see http://www.unicode.org/reports/tr13/tr13-9.html)
| \x{2028} # Unicode LS
| \x{2029} # Unicode PS
)
}x, $input;
| | | 6.22. Program: tcgrep | | 7. File Access |
Copyright © 2003 O'Reilly & Associates. All rights reserved.
|
|