A Regexp holds a regular expression, used to match a pattern
against strings. Regexps are created using the /.../ and
%r{...} literals, and by the Regexp::new
constructor.
Escapes any characters that would have special meaning in a regular
expression. Returns a new escaped string, or the argument itself if no
characters are escaped. For any string,
Regexp.new(Regexp.escape(str)) =~ str will be true.
Regexp.escape('\\*?{}.') #=> \\\\\*\?\{\}\.
/*
 * Quotes a string by handing it to rb_reg_quote().
 *
 * argc/argv: one required argument (the string) and one optional
 * argument (a kcode name), as parsed by rb_scan_args() with "11".
 *
 * When a kcode is given, rb_set_kcode() is called with it, the resulting
 * reg_kcode is copied into curr_kcode, and reg_kcode is then put back to
 * the value it had on entry.  rb_kcode_reset_option() is always called
 * after quoting (presumably to undo the temporary curr_kcode change --
 * confirm against its definition).
 *
 * Returns the value produced by rb_reg_quote().
 */
static VALUE
rb_reg_s_quote(argc, argv)
    int argc;
    VALUE *argv;
{
    VALUE str, kcode;
    VALUE quoted;
    const int entry_kcode = reg_kcode;

    rb_scan_args(argc, argv, "11", &str, &kcode);

    if (!NIL_P(kcode)) {
        /* switch kcode for this call only; the global is restored at once */
        rb_set_kcode(StringValuePtr(kcode));
        curr_kcode = reg_kcode;
        reg_kcode = entry_kcode;
    }

    StringValue(str);
    quoted = rb_reg_quote(str);
    rb_kcode_reset_option();

    return quoted;
}
The first form returns the MatchData object generated by the
last successful pattern match. Equivalent to reading the global variable
$~. The second form returns the nth field in this
MatchData object.
/c(.)t/ =~ 'cat' #=> 0
Regexp.last_match #=> #<MatchData:0x401b3d30>
Regexp.last_match(0) #=> "cat"
Regexp.last_match(1) #=> "a"
Regexp.last_match(2) #=> nil
/*
 * Returns information about the last match.
 *
 * argc/argv: zero or one argument (rb_scan_args() format "01").
 *
 * With no argument the result of match_getter() is returned.  With one
 * argument it is converted with NUM2INT() and used as the index passed to
 * rb_reg_nth_match() together with the current backref.
 */
static VALUE
rb_reg_s_last_match(argc, argv)
    int argc;
    VALUE *argv;
{
    VALUE nth;
    int given = rb_scan_args(argc, argv, "01", &nth);

    if (given != 1) {
        return match_getter();
    }
    return rb_reg_nth_match(NUM2INT(nth), rb_backref_get());
}
Constructs a new regular expression from pattern, which can be
either a String or a Regexp (in which case that
regexp's options are propagated, and new options may not be specified;
this is a change as of Ruby 1.8). If options is a Fixnum, it
should be one or more of the constants Regexp::EXTENDED,
Regexp::IGNORECASE, and Regexp::MULTILINE,
or-ed together. Otherwise, if options is not
nil, the regexp will be case insensitive. The lang
parameter enables multibyte support for the regexp: `n', `N' =
none, `e', `E' = EUC, `s', `S' = SJIS, `u', `U' =
UTF-8.
r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/
r2 = Regexp.new('cat', true) #=> /cat/i
r3 = Regexp.new('dog', Regexp::EXTENDED) #=> /dog/x
r4 = Regexp.new(r2) #=> /cat/i
/*
 * Initializes `self` from one to three arguments: pattern, options, lang.
 *
 * - Any other argument count raises ArgumentError.
 * - If the pattern is a Regexp, its source, the low four option bits and
 *   (when KCODE_FIXED is set) its kcode are copied; any extra arguments
 *   are ignored with a warning.
 * - Otherwise the pattern is converted to a string.  A Fixnum second
 *   argument is taken as the option bits; any other true value selects
 *   RE_OPTION_IGNORECASE.  A non-nil third argument selects the kcode bits
 *   from its first character (n/e/s/u, either case); unknown letters
 *   select none.
 *
 * Kcode is encoded in the flag bits under mask 0x70:
 * 16 = none, 32 = EUC, 48 = SJIS, 64 = UTF-8.
 *
 * Returns self.
 */
static VALUE
rb_reg_initialize_m(argc, argv, self)
    int argc;
    VALUE *argv;
    VALUE self;
{
    const char *src;
    long src_len;
    int opts = 0;

    if (argc > 3 || argc == 0) {
        rb_raise(rb_eArgError, "wrong number of arguments");
    }

    if (TYPE(argv[0]) != T_REGEXP) {
        /* pattern given as a string (or something convertible to one) */
        if (argc >= 2) {
            if (FIXNUM_P(argv[1])) {
                opts = FIX2INT(argv[1]);
            }
            else if (RTEST(argv[1])) {
                opts = RE_OPTION_IGNORECASE;
            }
        }
        if (argc == 3 && !NIL_P(argv[2])) {
            char *lang = StringValuePtr(argv[2]);
            int lang_bits = 0;

            switch (lang[0]) {
              case 'n': case 'N':
                lang_bits = 16;
                break;
              case 'e': case 'E':
                lang_bits = 32;
                break;
              case 's': case 'S':
                lang_bits = 48;
                break;
              case 'u': case 'U':
                lang_bits = 64;
                break;
              default:
                break;
            }
            /* drop any kcode bits passed in the options, then apply lang */
            opts = (opts & ~0x70) | lang_bits;
        }
        /* converted last, after the lang argument, and in place in argv */
        src = StringValuePtr(argv[0]);
        src_len = RSTRING(argv[0])->len;
    }
    else {
        /* pattern given as an existing Regexp: clone its settings */
        if (argc > 1) {
            rb_warn("flags%s ignored", (argc == 3) ? " and encoding": "");
        }
        rb_reg_check(argv[0]);
        opts = RREGEXP(argv[0])->ptr->options & 0xf;
        if (FL_TEST(argv[0], KCODE_FIXED)) {
            int kcode_bits = 0;

            switch (RBASIC(argv[0])->flags & KCODE_MASK) {
              case KCODE_NONE:
                kcode_bits = 16;
                break;
              case KCODE_EUC:
                kcode_bits = 32;
                break;
              case KCODE_SJIS:
                kcode_bits = 48;
                break;
              case KCODE_UTF8:
                kcode_bits = 64;
                break;
              default:
                break;
            }
            opts |= kcode_bits;
        }
        src = RREGEXP(argv[0])->str;
        src_len = RREGEXP(argv[0])->len;
    }

    rb_reg_initialize(self, src, src_len, opts);
    return self;
}
Escapes any characters that would have special meaning in a regular
expression. Returns a new escaped string, or the argument itself if no
characters are escaped. For any string,
Regexp.new(Regexp.escape(str)) =~ str will be true.
Regexp.escape('\\*?{}.') #=> \\\\\*\?\{\}\.
/*
 * Quotes a string by handing it to rb_reg_quote().
 *
 * argc/argv: one required argument (the string) and one optional
 * argument (a kcode name), as parsed by rb_scan_args() with "11".
 *
 * When a kcode is given, rb_set_kcode() is called with it, the resulting
 * reg_kcode is copied into curr_kcode, and reg_kcode is then put back to
 * the value it had on entry.  rb_kcode_reset_option() is always called
 * after quoting (presumably to undo the temporary curr_kcode change --
 * confirm against its definition).
 *
 * Returns the value produced by rb_reg_quote().
 */
static VALUE
rb_reg_s_quote(argc, argv)
    int argc;
    VALUE *argv;
{
    VALUE str, kcode;
    VALUE quoted;
    const int entry_kcode = reg_kcode;

    rb_scan_args(argc, argv, "11", &str, &kcode);

    if (!NIL_P(kcode)) {
        /* switch kcode for this call only; the global is restored at once */
        rb_set_kcode(StringValuePtr(kcode));
        curr_kcode = reg_kcode;
        reg_kcode = entry_kcode;
    }

    StringValue(str);
    quoted = rb_reg_quote(str);
    rb_kcode_reset_option();

    return quoted;
}
Return a Regexp object that is the union of the given
patterns, i.e., will match any of its parts. The patterns
can be Regexp objects, in which case their
options will be preserved, or Strings. If no arguments are given, returns
/(?!)/.
Regexp.union #=> /(?!)/
Regexp.union("penzance") #=> /penzance/
Regexp.union("skiing", "sledding") #=> /skiing|sledding/
Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/
/*
 * Builds a Regexp that is the alternation of the given arguments.
 *
 * - No arguments: a new Regexp built from the source "(?!)".
 * - One argument: if it converts to a Regexp (via to_regexp) that Regexp
 *   is returned as is; otherwise a new Regexp is built from the quoted
 *   argument.
 * - Several arguments: each one contributes either rb_reg_to_s() of the
 *   Regexp it converts to, or its quoted string form; the pieces are
 *   joined with "|".  If any Regexp has a fixed kcode, that kcode is
 *   passed on to the new Regexp; two different fixed kcodes raise
 *   ArgumentError.
 */
static VALUE
rb_reg_s_union(argc, argv)
    int argc;
    VALUE *argv;
{
    VALUE new_args[3];
    VALUE pattern;
    VALUE first_fixed = Qnil;   /* first Regexp seen with a fixed kcode */
    int fixed_kcode = -1;       /* -1: no fixed kcode seen yet */
    int idx;

    if (argc == 0) {
        new_args[0] = rb_str_new2("(?!)");
        return rb_class_new_instance(1, new_args, rb_cRegexp);
    }

    if (argc == 1) {
        VALUE only;

        only = rb_check_convert_type(argv[0], T_REGEXP, "Regexp", "to_regexp");
        if (!NIL_P(only)) {
            return only;
        }
        new_args[0] = rb_reg_s_quote(argc, argv);
        return rb_class_new_instance(1, new_args, rb_cRegexp);
    }

    pattern = rb_str_buf_new(0);
    for (idx = 0; idx < argc; idx++) {
        volatile VALUE piece;

        if (idx > 0) {
            rb_str_buf_cat2(pattern, "|");
        }

        piece = rb_check_convert_type(argv[idx], T_REGEXP, "Regexp", "to_regexp");
        if (NIL_P(piece)) {
            /* not a Regexp: use the quoted form of the argument */
            new_args[0] = argv[idx];
            piece = rb_reg_s_quote(1, new_args);
        }
        else {
            if (FL_TEST(piece, KCODE_FIXED)) {
                if (fixed_kcode == -1) {
                    first_fixed = piece;
                    fixed_kcode = RBASIC(piece)->flags & KCODE_MASK;
                }
                else if ((RBASIC(piece)->flags & KCODE_MASK) != fixed_kcode) {
                    volatile VALUE desc_a, desc_b;

                    desc_a = rb_inspect(first_fixed);
                    desc_b = rb_inspect(piece);
                    rb_raise(rb_eArgError, "mixed kcode: %s and %s",
                             RSTRING(desc_a)->ptr, RSTRING(desc_b)->ptr);
                }
            }
            piece = rb_reg_to_s(piece);
        }

        rb_str_buf_append(pattern, piece);
    }

    /* arguments for Regexp.new: source, options (nil), lang */
    new_args[0] = pattern;
    new_args[1] = Qnil;
    switch (fixed_kcode) {
      case -1:
        new_args[2] = Qnil;
        break;
      case KCODE_NONE:
        new_args[2] = rb_str_new2("n");
        break;
      case KCODE_EUC:
        new_args[2] = rb_str_new2("e");
        break;
      case KCODE_SJIS:
        new_args[2] = rb_str_new2("s");
        break;
      case KCODE_UTF8:
        new_args[2] = rb_str_new2("u");
        break;
    }
    return rb_class_new_instance(3, new_args, rb_cRegexp);
}
Equality—Two regexps are equal if their patterns are identical, they have
the same character set code, and their options (including the
casefold? value) are the same.
/abc/ == /abc/x #=> false
/abc/ == /abc/i #=> false
/abc/u == /abc/n #=> false
/*
 * Regexp equality.
 *
 * Returns Qtrue when re1 and re2 are the same object, or when re2 is a
 * Regexp whose source bytes, current kcode (rb_reg_cur_kcode) and option
 * bits all equal those of re1.  Returns Qfalse otherwise, including when
 * re2 is not a Regexp.
 */
static VALUE
rb_reg_equal(re1, re2)
    VALUE re1, re2;
{
    if (re1 == re2) {
        return Qtrue;
    }
    if (TYPE(re2) != T_REGEXP) {
        return Qfalse;
    }

    rb_reg_check(re1);
    rb_reg_check(re2);

    /* cheapest test first: differing lengths can never be equal */
    if (RREGEXP(re1)->len != RREGEXP(re2)->len) {
        return Qfalse;
    }
    if (memcmp(RREGEXP(re1)->str, RREGEXP(re2)->str, RREGEXP(re1)->len) != 0) {
        return Qfalse;
    }
    if (rb_reg_cur_kcode(re1) != rb_reg_cur_kcode(re2)) {
        return Qfalse;
    }
    if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) {
        return Qfalse;
    }
    return Qtrue;
}
Case Equality—Synonym for Regexp#=~ used in case statements.
a = "HELLO"
case a
when /^[a-z]*$/; print "Lower case\n"
when /^[A-Z]*$/; print "Upper case\n"
else; print "Mixed case\n"
end
produces:
Upper case
/*
 * Case equality: tests whether `re` matches `str`.
 *
 * If `str` is not a String and rb_check_string_type() yields nil for it,
 * the backref is set to nil and Qfalse is returned.  Otherwise the string
 * is searched from position 0; a negative result from rb_reg_search()
 * gives Qfalse, anything else Qtrue.
 */
VALUE
rb_reg_eqq(re, str)
    VALUE re, str;
{
    if (TYPE(str) != T_STRING) {
        str = rb_check_string_type(str);
        if (NIL_P(str)) {
            rb_backref_set(Qnil);
            return Qfalse;
        }
    }

    StringValue(str);
    return (rb_reg_search(re, str, 0, 0) < 0) ? Qfalse : Qtrue;
}
Returns a MatchData object describing the match, or
nil if there was no match. This is equivalent to retrieving
the value of the special variable $~ following a normal match.
/(.)(.)(.)/.match("abc")[2] #=> "b"
/*
 * Searches `str` for `re` starting at position 0.
 *
 * A nil `str` sets the backref to nil and yields Qnil.  Otherwise `str`
 * is coerced with StringValue() and searched; a negative result from
 * rb_reg_search() yields Qnil, any other result is returned as a Fixnum.
 */
VALUE
rb_reg_match(re, str)
    VALUE re, str;
{
    long pos;

    if (NIL_P(str)) {
        rb_backref_set(Qnil);
        return Qnil;
    }

    StringValue(str);
    pos = rb_reg_search(re, str, 0, 0);
    return (pos < 0) ? Qnil : LONG2FIX(pos);
}
Returns the value of the case-insensitive flag.
/*
 * Returns Qtrue if RE_OPTION_IGNORECASE is set in the compiled pattern's
 * option bits, Qfalse otherwise.
 */
static VALUE
rb_reg_casefold_p(re)
    VALUE re;
{
    rb_reg_check(re);
    return (RREGEXP(re)->ptr->options & RE_OPTION_IGNORECASE) ? Qtrue : Qfalse;
}
Equality—Two regexps are equal if their patterns are identical, they have
the same character set code, and their options (including the
casefold? value) are the same.
/abc/ == /abc/x #=> false
/abc/ == /abc/i #=> false
/abc/u == /abc/n #=> false
/*
 * Regexp equality.
 *
 * Returns Qtrue when re1 and re2 are the same object, or when re2 is a
 * Regexp whose source bytes, current kcode (rb_reg_cur_kcode) and option
 * bits all equal those of re1.  Returns Qfalse otherwise, including when
 * re2 is not a Regexp.
 */
static VALUE
rb_reg_equal(re1, re2)
    VALUE re1, re2;
{
    if (re1 == re2) {
        return Qtrue;
    }
    if (TYPE(re2) != T_REGEXP) {
        return Qfalse;
    }

    rb_reg_check(re1);
    rb_reg_check(re2);

    /* cheapest test first: differing lengths can never be equal */
    if (RREGEXP(re1)->len != RREGEXP(re2)->len) {
        return Qfalse;
    }
    if (memcmp(RREGEXP(re1)->str, RREGEXP(re2)->str, RREGEXP(re1)->len) != 0) {
        return Qfalse;
    }
    if (rb_reg_cur_kcode(re1) != rb_reg_cur_kcode(re2)) {
        return Qfalse;
    }
    if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) {
        return Qfalse;
    }
    return Qtrue;
}
Produce a hash based on the text and options of this regular expression.
/*
 * Computes a hash value from the option bits and the source bytes of the
 * regular expression.
 *
 * The value starts from the compiled pattern's options, is folded with
 * each source byte as h = h * 33 + byte, and is finally mixed with
 * h + (h >> 5).  The result is returned as a Fixnum.
 *
 * The accumulation is done in unsigned arithmetic: the multiply-and-add
 * overflows for all but the shortest patterns, and overflow of a signed
 * int is undefined behaviour in C.  Unsigned arithmetic wraps modulo
 * 2^N, which on two's-complement targets gives the same bit pattern a
 * wrapping signed computation would.  The value is converted back to int
 * before the final shift so the shift keeps operating on a signed value.
 *
 * The length is held in a long so that it is not truncated when copied
 * from the regexp.
 */
static VALUE
rb_reg_hash(re)
    VALUE re;
{
    unsigned int acc;
    int hashval;
    long len;
    const char *p;

    rb_reg_check(re);
    acc = (unsigned int)RREGEXP(re)->ptr->options;
    len = RREGEXP(re)->len;
    p = RREGEXP(re)->str;
    while (len-- > 0) {
        /* the byte is promoted through int first, so a negative char
         * contributes the same value as in signed arithmetic */
        acc = acc * 33u + (unsigned int)(int)*p++;
    }
    hashval = (int)acc;
    hashval = (int)((unsigned int)hashval + (unsigned int)(hashval >> 5));
    return INT2FIX(hashval);
}
Produce a nicely formatted string-version of rxp. Perhaps
surprisingly, #inspect actually produces the more natural
version of the string than #to_s.
/ab+c/ix.inspect #=> "/ab+c/ix"
/*
 * Returns the description string produced by rb_reg_desc() for the
 * regexp's source text and length.
 */
static VALUE
rb_reg_inspect(re)
    VALUE re;
{
    const char *src;
    long src_len;

    rb_reg_check(re);
    src = RREGEXP(re)->str;
    src_len = RREGEXP(re)->len;
    return rb_reg_desc(src, src_len, re);
}
Returns the character set code for the regexp.
/*
 * Returns the name of the regexp's fixed kcode as a new String
 * ("none", "euc", "sjis" or "utf8"), or Qnil when the regexp does not
 * have the KCODE_FIXED flag set.
 */
static VALUE
rb_reg_kcode_m(re)
    VALUE re;
{
    const char *name;

    if (!FL_TEST(re, KCODE_FIXED)) {
        return Qnil;
    }

    switch (RBASIC(re)->flags & KCODE_MASK) {
      case KCODE_NONE:
        name = "none";
        break;
      case KCODE_EUC:
        name = "euc";
        break;
      case KCODE_SJIS:
        name = "sjis";
        break;
      case KCODE_UTF8:
        name = "utf8";
        break;
      default:
        /* rb_bug() is expected not to return */
        rb_bug("unknown kcode - should not happen");
        break;
    }
    return rb_str_new2(name);
}
Returns a MatchData object describing the match, or
nil if there was no match. This is equivalent to retrieving
the value of the special variable $~ following a normal match.
/(.)(.)(.)/.match("abc")[2] #=> "b"
/*
 * Runs rb_reg_match(re, str).  If that yields nil, Qnil is returned.
 * Otherwise the current backref is fetched, passed to rb_match_busy()
 * and returned.
 */
static VALUE
rb_reg_match_m(re, str)
    VALUE re, str;
{
    VALUE match;

    if (NIL_P(rb_reg_match(re, str))) {
        return Qnil;
    }

    match = rb_backref_get();
    rb_match_busy(match);
    return match;
}
Returns the set of bits corresponding to the options used when creating
this Regexp (see Regexp::new for
details). Note that additional bits may be set in the returned options:
these are used internally by the regular expression code. These extra bits
are ignored if the options are passed to Regexp::new.
Regexp::IGNORECASE #=> 1
Regexp::EXTENDED #=> 2
Regexp::MULTILINE #=> 4
/cat/.options #=> 128
/cat/ix.options #=> 131
Regexp.new('cat', true).options #=> 129
Regexp.new('cat', 0, 's').options #=> 384
r = /cat/ix
Regexp.new(r.source, r.options) #=> /cat/ix
/*
 * Returns the value of rb_reg_options(re) converted to a Ruby integer.
 */
static VALUE
rb_reg_options_m(re)
    VALUE re;
{
    const int bits = rb_reg_options(re);

    return INT2NUM(bits);
}
Returns the original string of the pattern.
/ab+c/ix.source #=> "ab+c"
/*
 * Returns a new String holding a copy of the regexp's source text.
 * The copy is marked tainted when the regexp itself is tainted.
 */
static VALUE
rb_reg_source(re)
    VALUE re;
{
    VALUE src;

    rb_reg_check(re);
    src = rb_str_new(RREGEXP(re)->str, RREGEXP(re)->len);
    if (OBJ_TAINTED(re)) {
        OBJ_TAINT(src);
    }
    return src;
}
Returns a string containing the regular expression and its options (using
the (?xxx:yyy) notation). This string can be fed back in to
Regexp::new to create a regular expression with the same semantics as
the original. (However, Regexp#== may not return true when
comparing the two, as the source of the regular expression itself may
differ, as the example shows). Regexp#inspect produces a
generally more readable version of rxp.
r1 = /ab+c/ix #=> /ab+c/ix
s1 = r1.to_s #=> "(?ix-m:ab+c)"
r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/
r1 == r2 #=> false
r1.source #=> "ab+c"
r2.source #=> "(?ix-m:ab+c)"
/*
 * Builds the "(?on-off:source)" string form of a regexp.
 *
 * The m/i/x option bits come from the compiled pattern.  If the source
 * itself starts with option groups -- "(?flags)" prefixes and/or one
 * outer "(?flags-flags:...)" wrapper -- those are stripped and their
 * flags folded into the option bits, provided the remaining inner
 * expression still compiles on its own.  If it does not, the original
 * source and options are used unchanged.
 *
 * Returns a new string; OBJ_INFECT(str, re) is applied to it before
 * returning (presumably propagating taint from re -- confirm against the
 * macro definition).
 */
static VALUE
rb_reg_to_s(re)
VALUE re;
{
int options;
/* the three options that can be written inside a (?...) group */
const int embeddable = RE_OPTION_MULTILINE|RE_OPTION_IGNORECASE|RE_OPTION_EXTENDED;
long len;
const char* ptr;
VALUE str = rb_str_buf_new2("(?");
rb_reg_check(re);
options = RREGEXP(re)->ptr->options;
ptr = RREGEXP(re)->str;
len = RREGEXP(re)->len;
/* re-entered after each leading "(?flags)" prefix has been consumed */
again:
if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
/* stays 1 unless a "(?...:...)" wrapper is found and its body compiles */
int err = 1;
ptr += 2;
if ((len -= 2) > 0) {
/* flags before '-' (or before ':' / ')') switch options on */
do {
if (*ptr == 'm') {
options |= RE_OPTION_MULTILINE;
}
else if (*ptr == 'i') {
options |= RE_OPTION_IGNORECASE;
}
else if (*ptr == 'x') {
options |= RE_OPTION_EXTENDED;
}
else break;
++ptr;
} while (--len > 0);
}
if (len > 1 && *ptr == '-') {
++ptr;
--len;
/* flags after '-' switch options off */
do {
if (*ptr == 'm') {
options &= ~RE_OPTION_MULTILINE;
}
else if (*ptr == 'i') {
options &= ~RE_OPTION_IGNORECASE;
}
else if (*ptr == 'x') {
options &= ~RE_OPTION_EXTENDED;
}
else break;
++ptr;
} while (--len > 0);
}
/* "(?flags)" with no body: skip the ')' and parse what follows */
if (*ptr == ')') {
--len;
++ptr;
goto again;
}
/* "(?flags:body)" spanning the rest of the source */
if (*ptr == ':' && ptr[len-1] == ')') {
/* scratch pattern buffer, used only to test-compile the body */
Regexp *rp;
rb_kcode_set_option(re);
rp = ALLOC(Regexp);
MEMZERO((char *)rp, Regexp, 1);
/* ++ptr skips ':', len -= 2 drops the ':' and the closing ')' */
err = re_compile_pattern(++ptr, len -= 2, rp) != 0;
rb_kcode_reset_option();
re_free_pattern(rp);
}
if (err) {
/* could not unwrap: fall back to the untouched source and options */
options = RREGEXP(re)->ptr->options;
ptr = RREGEXP(re)->str;
len = RREGEXP(re)->len;
}
}
/* enabled options first ... */
if (options & RE_OPTION_MULTILINE) rb_str_buf_cat2(str, "m");
if (options & RE_OPTION_IGNORECASE) rb_str_buf_cat2(str, "i");
if (options & RE_OPTION_EXTENDED) rb_str_buf_cat2(str, "x");
/* ... then, unless all three are on, '-' followed by the disabled ones */
if ((options & embeddable) != embeddable) {
rb_str_buf_cat2(str, "-");
if (!(options & RE_OPTION_MULTILINE)) rb_str_buf_cat2(str, "m");
if (!(options & RE_OPTION_IGNORECASE)) rb_str_buf_cat2(str, "i");
if (!(options & RE_OPTION_EXTENDED)) rb_str_buf_cat2(str, "x");
}
rb_str_buf_cat2(str, ":");
rb_reg_expr_str(str, ptr, len);
rb_str_buf_cat2(str, ")");
OBJ_INFECT(str, re);
return str;
}
Match—Matches rxp against the contents of $_.
Equivalent to rxp =~ $_.
$_ = "input data"
~ /at/ #=> 7
/*
 * Matches `re` against the value returned by rb_lastline_get().
 *
 * If that value is not a String, the backref is set to nil and Qnil is
 * returned.  Otherwise the line is searched from position 0; a negative
 * result from rb_reg_search() yields Qnil, any other result is returned
 * as a Fixnum.
 */
VALUE
rb_reg_match2(re)
    VALUE re;
{
    VALUE line = rb_lastline_get();
    long pos;

    if (TYPE(line) != T_STRING) {
        rb_backref_set(Qnil);
        return Qnil;
    }

    pos = rb_reg_search(re, line, 0, 0);
    return (pos < 0) ? Qnil : LONG2FIX(pos);
}