A Regexp
holds a regular expression, used to match a pattern
against strings. Regexps are created using the /.../
and
%r{...}
literals, and by the Regexp::new
constructor.
Escapes any characters that would have special meaning in a regular
expression. Returns a new escaped string, or self if no characters are
escaped. For any string,
Regexp.new(Regexp.escape(str))=~str
will be
true.
Regexp.escape('\*?{}.') #=> \\\*\?\{\}\.
/*
 * Class-level quoting entry point (documented alongside as Regexp.escape).
 *
 * c   - receiver; not used by this function.
 * str - the object to quote; it is first passed through
 *       reg_operand(str, Qtrue).
 *
 * Returns whatever rb_reg_quote() produces for the resulting operand.
 */
static VALUE
rb_reg_s_quote(VALUE c, VALUE str)
{
    VALUE operand = reg_operand(str, Qtrue);

    return rb_reg_quote(operand);
}
The first form returns the MatchData
object generated by the
last successful pattern match. Equivalent to reading the global variable
$~
. The second form returns the nth field in this
MatchData
object. n can be a string or symbol to
reference a named capture.
/c(.)t/ =~ 'cat' #=> 0 Regexp.last_match #=> #<MatchData "cat" 1:"a"> Regexp.last_match(0) #=> "cat" Regexp.last_match(1) #=> "a" Regexp.last_match(2) #=> nil /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val" Regexp.last_match #=> #<MatchData "var = val" lhs:"var" rhs:"val"> Regexp.last_match(:lhs) #=> "var" Regexp.last_match(:rhs) #=> "val"
/*
 * Regexp.last_match implementation.
 *
 * With no argument, returns the result of match_getter().  With one
 * argument, fetches the current back-reference via rb_backref_get() and
 * returns the group selected by that argument, or nil when there is no
 * back-reference.
 *
 * argc/argv - at most one optional argument (scan spec "01").
 */
static VALUE
rb_reg_s_last_match(int argc, VALUE *argv)
{
    VALUE nth;
    VALUE match;

    /* No selector supplied: hand back the whole last match. */
    if (argc <= 0 || rb_scan_args(argc, argv, "01", &nth) != 1) {
        return match_getter();
    }

    match = rb_backref_get();
    if (NIL_P(match)) {
        return Qnil;
    }

    /* match_backref_number() turns the selector into a group number. */
    return rb_reg_nth_match(match_backref_number(match, nth), match);
}
Constructs a new regular expression from pattern, which can be
either a String
or a Regexp
(in which case that
regexp's options are propagated, and new options may not be specified;
this is a change as of Ruby 1.8). If options is a Fixnum
, it
should be one or more of the constants Regexp::EXTENDED
,
Regexp::IGNORECASE
, and Regexp::MULTILINE
,
or-ed together. Otherwise, if options is not
nil
, the regexp will be case insensitive.
r1 = Regexp.new('^a-z+:\\s+\w+') #=> /^a-z+:\s+\w+/ r2 = Regexp.new('cat', true) #=> /cat/i r3 = Regexp.new('dog', Regexp::EXTENDED) #=> /dog/x r4 = Regexp.new(r2) #=> /cat/i
/*
 * Regexp#initialize implementation.
 *
 * argv[0] - the pattern: either an existing Regexp or a String.
 * argv[1] - optional flags (String pattern only): a Fixnum is used as the
 *           flag bits; any other truthy value selects
 *           ONIG_OPTION_IGNORECASE.
 * argv[2] - optional kcode string (String pattern only): a value starting
 *           with 'n' or 'N' selects the ASCII-8BIT encoding and sets
 *           ARG_ENCODING_NONE; anything else is ignored with a warning.
 *
 * Raises ArgumentError when argc is 0 or greater than 3.  Compilation
 * failures are reported through rb_reg_raise_str().
 *
 * Returns self.
 */
static VALUE
rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
{
    onig_errmsg_buffer err = "";
    int flags = 0;
    VALUE str;
    rb_encoding *enc;
    const char *ptr;
    long len;

    if (argc == 0 || argc > 3) {
        rb_raise(rb_eArgError, "wrong number of arguments");
    }

    if (TYPE(argv[0]) == T_REGEXP) {
        VALUE re = argv[0];

        /* Options always come from the source regexp; extras are dropped. */
        if (argc > 1) {
            rb_warn("flags ignored");
        }
        rb_reg_check(re);
        flags = rb_reg_options(re);
        ptr = RREGEXP_SRC_PTR(re);
        len = RREGEXP_SRC_LEN(re);
        enc = rb_enc_get(re);
        if (rb_reg_initialize(self, ptr, len, enc, flags, err)) {
            str = rb_enc_str_new(ptr, len, enc);
            rb_reg_raise_str(str, flags, err);
        }
    }
    else {
        if (argc >= 2) {
            if (FIXNUM_P(argv[1])) {
                flags = FIX2INT(argv[1]);
            }
            else if (RTEST(argv[1])) {
                flags = ONIG_OPTION_IGNORECASE;
            }
        }

        enc = 0;
        if (argc == 3 && !NIL_P(argv[2])) {
            char *kcode = StringValuePtr(argv[2]);

            /*
             * Only the first character decides.  The previous code tested
             * kcode[1] for the uppercase case, which missed "N" and read
             * one byte past the first character.
             */
            if (kcode[0] == 'n' || kcode[0] == 'N') {
                enc = rb_ascii8bit_encoding();
                flags |= ARG_ENCODING_NONE;
            }
            else {
                rb_warn("encoding option is ignored - %s", kcode);
            }
        }

        str = argv[0];
        ptr = StringValuePtr(str);
        /* An explicit encoding uses the raw-buffer initializer; otherwise
         * the string-based initializer is used. */
        if (enc
            ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err)
            : rb_reg_initialize_str(self, str, flags, err)) {
            rb_reg_raise_str(str, flags, err);
        }
    }
    return self;
}
Escapes any characters that would have special meaning in a regular
expression. Returns a new escaped string, or self if no characters are
escaped. For any string,
Regexp.new(Regexp.escape(str))=~str
will be
true.
Regexp.escape('\*?{}.') #=> \\\*\?\{\}\.
/*
 * Class-level quoting entry point (documented alongside as Regexp.escape).
 *
 * c   - receiver; not used by this function.
 * str - the object to quote; it is first passed through
 *       reg_operand(str, Qtrue).
 *
 * Returns whatever rb_reg_quote() produces for the resulting operand.
 */
static VALUE
rb_reg_s_quote(VALUE c, VALUE str)
{
    VALUE operand = reg_operand(str, Qtrue);

    return rb_reg_quote(operand);
}
Tries to convert obj into a Regexp, using its to_regexp method. Returns the converted regexp, or nil if obj cannot be converted for any reason.
Regexp.try_convert(/re/) #=> /re/ Regexp.try_convert("re") #=> nil o = Object.new Regexp.try_convert(o) #=> nil def o.to_regexp() /foo/ end Regexp.try_convert(o) #=> /foo/
/*
 * Regexp.try_convert implementation.
 *
 * dummy - receiver; not used by this function.
 * re    - the object to convert.
 *
 * Returns the result of rb_check_regexp_type(re) unchanged.
 */
static VALUE
rb_reg_s_try_convert(VALUE dummy, VALUE re)
{
    VALUE converted = rb_check_regexp_type(re);

    return converted;
}
Return a Regexp
object that is the union of the given
patterns, i.e., will match any of its parts. The patterns
can be Regexp objects, in which case their
options will be preserved, or Strings. If no patterns are given, returns
/(?!)/
.
Regexp.union #=> /(?!)/ Regexp.union("penzance") #=> /penzance/ Regexp.union("a+b*c") #=> /a\+b\*c/ Regexp.union("skiing", "sledding") #=> /skiing|sledding/ Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/ Regexp.union(/dogs/, /cats/i) #=> /(?-mix:dogs)|(?i-mx:cats)/
/*
 * Regexp.union implementation (argument-list front end).
 *
 * self - the receiver, forwarded to rb_reg_s_union().
 * args - array of the arguments given to the method.
 *
 * When exactly one argument is given and rb_check_array_type() accepts it,
 * that array is used as the pattern list; otherwise the argument array
 * itself is used.
 */
static VALUE
rb_reg_s_union_m(VALUE self, VALUE args)
{
    VALUE patterns = args;

    if (RARRAY_LEN(args) == 1) {
        VALUE sole = rb_check_array_type(rb_ary_entry(args, 0));

        if (!NIL_P(sole)) {
            patterns = sole;
        }
    }
    return rb_reg_s_union(self, patterns);
}
Equality—Two regexps are equal if their patterns are identical, they have
the same character set code, and their casefold?
values are
the same.
/abc/ == /abc/x #=> false /abc/ == /abc/i #=> false /abc/ == /abc/n #=> false /abc/u == /abc/n #=> false
/*
 * Regexp#== implementation.
 *
 * Returns Qtrue when re2 is the same object as re1, or when re2 is a
 * Regexp whose KCODE_FIXED flag, compiled options, source length, encoding
 * index and source bytes all match those of re1.  Returns Qfalse otherwise.
 */
static VALUE
rb_reg_equal(VALUE re1, VALUE re2)
{
    long srclen;

    if (re1 == re2) {
        return Qtrue;
    }
    if (TYPE(re2) != T_REGEXP) {
        return Qfalse;
    }
    rb_reg_check(re1);
    rb_reg_check(re2);

    /* Cheap flag and option comparisons first. */
    if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED) ||
        RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) {
        return Qfalse;
    }

    srclen = RREGEXP_SRC_LEN(re1);
    if (srclen != RREGEXP_SRC_LEN(re2) ||
        ENCODING_GET(re1) != ENCODING_GET(re2)) {
        return Qfalse;
    }

    /* Same length and encoding: compare the source bytes. */
    return memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), srclen) == 0
        ? Qtrue
        : Qfalse;
}
Case Equality—Synonym for Regexp#=~
used in case statements.
a = "HELLO" case a when /^[a-z]*$/; print "Lower case\n" when /^[A-Z]*$/; print "Upper case\n" else; print "Mixed case\n" end
produces:
Upper case
/*
 * Regexp#=== implementation.
 *
 * str is first passed through reg_operand(str, Qfalse).  If that yields
 * nil, the back-reference is reset to nil and Qfalse is returned.
 * Otherwise the regexp is searched from position 0, and the result is
 * Qtrue if rb_reg_search() reports a non-negative position, else Qfalse.
 */
VALUE
rb_reg_eqq(VALUE re, VALUE str)
{
    VALUE operand = reg_operand(str, Qfalse);

    if (NIL_P(operand)) {
        rb_backref_set(Qnil);
        return Qfalse;
    }
    return (rb_reg_search(re, operand, 0, 0) < 0) ? Qfalse : Qtrue;
}
Match—Matches rxp against str.
/at/ =~ "input data" #=> 7 /ax/ =~ "input data" #=> nil
If =~
is used with a regexp literal with named captures,
the captured strings (or nil) are assigned to local variables named by the
capture names.
/(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = y " p lhs #=> "x" p rhs #=> "y"
If it is not matched, nil is assigned for the variables.
/(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ " x = " p lhs #=> nil p rhs #=> nil
This assignment is implemented in the Ruby parser. The parser detects 'regexp-literal =~ expression' for the assignment. The regexp must be a literal without interpolation and placed at left hand side.
The assignment does not occur if the regexp is not a literal.
re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ re =~ " x = y " p lhs # undefined local variable p rhs # undefined local variable
A regexp interpolation, #{}
, also disables the assignment.
rhs_pat = /(?<rhs>\w+)/ /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y" p lhs # undefined local variable
The assignment does not occur if the regexp is placed on the right hand side.
" x = y " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ p lhs, rhs # undefined local variable
/*
 * Regexp#=~ implementation.
 *
 * Returns Qnil when reg_match_pos() reports a negative position.
 * Otherwise the position is passed through rb_str_sublen() and returned
 * as a Fixnum.
 * NOTE(review): rb_str_sublen() presumably maps a byte offset to a
 * character offset -- confirm against its definition.
 */
VALUE
rb_reg_match(VALUE re, VALUE str)
{
    long found;
    long offset;

    /* reg_match_pos() takes &str, so it may replace str. */
    found = reg_match_pos(re, &str, 0);
    if (found < 0) {
        return Qnil;
    }
    offset = rb_str_sublen(str, found);
    return LONG2FIX(offset);
}
Returns the value of the case-insensitive flag.
/a/.casefold? #=> false /a/i.casefold? #=> true /(?i:a)/.casefold? #=> false
/*
 * Regexp#casefold? implementation.
 *
 * Returns Qtrue when ONIG_OPTION_IGNORECASE is set in the compiled
 * pattern's options, Qfalse otherwise.
 */
static VALUE
rb_reg_casefold_p(VALUE re)
{
    rb_reg_check(re);
    return (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE)
        ? Qtrue
        : Qfalse;
}
Returns the Encoding object that represents the encoding of obj.
/*
 * Returns the Encoding object for obj.
 *
 * Looks the encoding up with rb_enc_get() and wraps it with
 * rb_enc_from_encoding().  Raises TypeError ("unknown encoding") when
 * rb_enc_get() returns a null pointer.
 */
VALUE
rb_obj_encoding(VALUE obj)
{
    rb_encoding *found = rb_enc_get(obj);

    if (!found) {
        rb_raise(rb_eTypeError, "unknown encoding");
    }
    return rb_enc_from_encoding(found);
}
Equality—Two regexps are equal if their patterns are identical, they have
the same character set code, and their casefold?
values are
the same.
/abc/ == /abc/x #=> false /abc/ == /abc/i #=> false /abc/ == /abc/n #=> false /abc/u == /abc/n #=> false
/*
 * Regexp#== implementation.
 *
 * Returns Qtrue when re2 is the same object as re1, or when re2 is a
 * Regexp whose KCODE_FIXED flag, compiled options, source length, encoding
 * index and source bytes all match those of re1.  Returns Qfalse otherwise.
 */
static VALUE
rb_reg_equal(VALUE re1, VALUE re2)
{
    long srclen;

    if (re1 == re2) {
        return Qtrue;
    }
    if (TYPE(re2) != T_REGEXP) {
        return Qfalse;
    }
    rb_reg_check(re1);
    rb_reg_check(re2);

    /* Cheap flag and option comparisons first. */
    if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED) ||
        RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) {
        return Qfalse;
    }

    srclen = RREGEXP_SRC_LEN(re1);
    if (srclen != RREGEXP_SRC_LEN(re2) ||
        ENCODING_GET(re1) != ENCODING_GET(re2)) {
        return Qfalse;
    }

    /* Same length and encoding: compare the source bytes. */
    return memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), srclen) == 0
        ? Qtrue
        : Qfalse;
}
Returns false if rxp is applicable to a string with any ASCII compatible encoding. Returns true otherwise.
r = /a/ r.fixed_encoding? #=> false r =~ "\u{6666} a" #=> 2 r =~ "\xa1\xa2 a".force_encoding("euc-jp") #=> 2 r =~ "abc".force_encoding("euc-jp") #=> 0 r = /a/u r.fixed_encoding? #=> true r.encoding #=> #<Encoding:UTF-8> r =~ "\u{6666} a" #=> 2 r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError r =~ "abc".force_encoding("euc-jp") #=> 0 r = /\u{6666}/ r.fixed_encoding? #=> true r.encoding #=> #<Encoding:UTF-8> r =~ "\u{6666} a" #=> 0 r =~ "\xa1\xa2".force_encoding("euc-jp") #=> ArgumentError r =~ "abc".force_encoding("euc-jp") #=> nil
/*
 * Regexp#fixed_encoding? implementation.
 *
 * Returns Qtrue when the KCODE_FIXED flag is set on re, Qfalse otherwise.
 */
static VALUE
rb_reg_fixed_encoding_p(VALUE re)
{
    return FL_TEST(re, KCODE_FIXED) ? Qtrue : Qfalse;
}
Produce a hash based on the text and options of this regular expression.
/*
 * Regexp#hash implementation.
 *
 * Seeds the hash with the compiled pattern's option bits, folds in every
 * byte of the source text (h = h * 33 + byte), then mixes in h >> 5.
 * Returns the result as a Fixnum.
 *
 * The accumulation is done in unsigned arithmetic so that wrap-around is
 * well defined (signed int overflow is undefined behaviour in C), and the
 * byte counter is a long so a source longer than INT_MAX is not truncated.
 * On two's-complement platforms with an arithmetic right shift the value
 * produced is the same bit pattern the signed-int version yielded.
 */
static VALUE
rb_reg_hash(VALUE re)
{
    unsigned int acc;
    long remaining;
    const char *p;
    int hashval;

    rb_reg_check(re);
    acc = (unsigned int)RREGEXP(re)->ptr->options;
    remaining = RREGEXP_SRC_LEN(re);
    p = RREGEXP_SRC_PTR(re);

    while (remaining-- > 0) {
        /* Converting the (possibly signed) char to unsigned keeps the
         * same result modulo 2^N as adding the signed value. */
        acc = acc * 33u + (unsigned int)*p++;
    }

    /* Shift on the signed view to keep the original sign-propagating
     * shift, but perform the addition unsigned to avoid overflow. */
    hashval = (int)acc;
    acc += (unsigned int)(hashval >> 5);
    hashval = (int)acc;

    return INT2FIX(hashval);
}
Produce a nicely formatted string-version of rxp. Perhaps
surprisingly, #inspect
actually produces a more natural
version of the string than #to_s
.
/ab+c/ix.inspect #=> "/ab+c/ix"
/*
 * Regexp#inspect implementation.
 *
 * When the regexp has a compiled pattern, a source object and a source
 * pointer, returns rb_reg_desc() for the source text.  If any of those is
 * missing, falls back to rb_any_to_s().
 */
static VALUE
rb_reg_inspect(VALUE re)
{
    if (RREGEXP(re)->ptr && RREGEXP_SRC(re) && RREGEXP_SRC_PTR(re)) {
        return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
    }
    return rb_any_to_s(re);
}
Returns a MatchData
object describing the match, or
nil
if there was no match. This is equivalent to retrieving
the value of the special variable $~
following a normal match.
If the second parameter is present, it specifies the position in the string
to begin the search.
/(.)(.)(.)/.match("abc")[2] #=> "b" /(.)(.)/.match("abc", 1)[2] #=> "c"
If a block is given and the match succeeds, the block is invoked with the MatchData, so that you can write
pat.match(str) {|m| ...}
instead of
if m = pat.match(str) ... end
The return value is a value from block execution in this case.
/*
 * Regexp#match implementation.
 *
 * argv[0] - the string to match against (required).
 * argv[1] - optional start position, converted with NUM2LONG.
 *
 * On failure (negative position from reg_match_pos) the back-reference is
 * reset to nil and Qnil is returned.  On success the current
 * back-reference is marked busy and returned; if a block is given and the
 * back-reference is not nil, the block is yielded the back-reference and
 * its value is returned instead.
 */
static VALUE
rb_reg_match_m(int argc, VALUE *argv, VALUE re)
{
    VALUE str, initpos, match;
    long start = 0;

    if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
        start = NUM2LONG(initpos);
    }

    start = reg_match_pos(re, &str, start);
    if (start < 0) {
        rb_backref_set(Qnil);
        return Qnil;
    }

    match = rb_backref_get();
    rb_match_busy(match);

    if (NIL_P(match) || !rb_block_given_p()) {
        return match;
    }
    return rb_yield(match);
}
Returns a hash representing information about named captures of rxp.
Each key of the hash is the name of a named capture. Each value is an array listing the indexes of the corresponding named captures.
/(?<foo>.)(?<bar>.)/.named_captures #=> {"foo"=>[1], "bar"=>[2]} /(?<foo>.)(?<foo>.)/.named_captures #=> {"foo"=>[1, 2]}
If there are no named captures, an empty hash is returned.
/(.)(.)/.named_captures #=> {}
/*
 * Regexp#named_captures implementation.
 *
 * Creates a new hash and lets onig_foreach_name() call
 * reg_named_captures_iter for each name in the compiled pattern, passing
 * the hash as the callback argument.  Returns that hash.
 */
static VALUE
rb_reg_named_captures(VALUE re)
{
    VALUE captures = rb_hash_new();

    rb_reg_check(re);
    onig_foreach_name(RREGEXP(re)->ptr,
                      reg_named_captures_iter,
                      (void *)captures);
    return captures;
}
Returns a list of names of captures as an array of strings.
/(?<foo>.)(?<bar>.)(?<baz>.)/.names #=> ["foo", "bar", "baz"] /(?<foo>.)(?<foo>.)/.names #=> ["foo"] /(.)(.)/.names #=> []
/*
 * Regexp#names implementation.
 *
 * Creates a new array and lets onig_foreach_name() call reg_names_iter for
 * each name in the compiled pattern, passing the array as the callback
 * argument.  Returns that array.
 */
static VALUE
rb_reg_names(VALUE re)
{
    VALUE names = rb_ary_new();

    rb_reg_check(re);
    onig_foreach_name(RREGEXP(re)->ptr,
                      reg_names_iter,
                      (void *)names);
    return names;
}
Returns the set of bits corresponding to the options used when creating
this Regexp (see Regexp::new
for
details). Note that additional bits may be set in the returned options:
these are used internally by the regular expression code. These extra bits
are ignored if the options are passed to Regexp::new
.
Regexp::IGNORECASE #=> 1 Regexp::EXTENDED #=> 2 Regexp::MULTILINE #=> 4 /cat/.options #=> 0 /cat/ix.options #=> 3 Regexp.new('cat', true).options #=> 1 /\xa1\xa2/e.options #=> 16 r = /cat/ix Regexp.new(r.source, r.options) #=> /cat/ix
/*
 * Regexp#options implementation.
 *
 * Returns the integer from rb_reg_options(re) converted to a Ruby numeric
 * with INT2NUM.
 */
static VALUE
rb_reg_options_m(VALUE re)
{
    int bits;

    bits = rb_reg_options(re);
    return INT2NUM(bits);
}
Returns the original string of the pattern.
/ab+c/ix.source #=> "ab+c"
Note that escape sequences are retained as is.
/\x20\+/.source #=> "\\x20\\+"
/*
 * Regexp#source implementation.
 *
 * Returns a new string built from the regexp's source bytes, using the
 * regexp's encoding.  The string is tainted when the regexp is tainted.
 */
static VALUE
rb_reg_source(VALUE re)
{
    VALUE src;

    rb_reg_check(re);
    src = rb_enc_str_new(RREGEXP_SRC_PTR(re),
                         RREGEXP_SRC_LEN(re),
                         rb_enc_get(re));
    if (OBJ_TAINTED(re)) {
        OBJ_TAINT(src);
    }
    return src;
}
Returns a string containing the regular expression and its options (using
the (?opts:source)
notation). This string can be fed back in to
Regexp::new
to create a regular expression with the same semantics as
the original. (However, Regexp#==
may not return true when
comparing the two, as the source of the regular expression itself may
differ, as the example shows). Regexp#inspect
produces a
generally more readable version of rxp.
r1 = /ab+c/ix #=> /ab+c/ix s1 = r1.to_s #=> "(?ix-m:ab+c)" r2 = Regexp.new(s1) #=> /(?ix-m:ab+c)/ r1 == r2 #=> false r1.source #=> "ab+c" r2.source #=> "(?ix-m:ab+c)"
/*
 * Regexp#to_s implementation.
 *
 * Builds a string starting with "(?", followed by the text option_to_str()
 * produces for the active options, optionally "-" plus the text it produces
 * for the complement of the options, then ":", the pattern text written by
 * rb_reg_expr_str(), and a closing ")".
 *
 * If the source itself starts with "(?", the leading option group is
 * parsed and folded into the options so that it is not nested again;
 * when that parse cannot be validated, the regexp's own options and full
 * source are used instead.
 *
 * The result copies the regexp's encoding and is infected by re
 * (OBJ_INFECT).
 */
static VALUE rb_reg_to_s(VALUE re) {
    int options, opt;
    /* The three option bits that can be written inside "(?...)". */
    const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
    long len;
    const UChar* ptr;
    VALUE str = rb_str_buf_new2("(?");
    char optbuf[5];
    rb_reg_check(re);
    rb_enc_copy(str, re);
    /* Start from the compiled options and the whole source text. */
    options = RREGEXP(re)->ptr->options;
    ptr = (UChar*)RREGEXP_SRC_PTR(re);
    len = RREGEXP_SRC_LEN(re);
    again:
    /* Only sources of at least 4 bytes that begin with "(?" are examined. */
    if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
        /* err stays 1 unless the trial compile below succeeds. */
        int err = 1;
        ptr += 2;
        /* Turn on every option for which char_to_option() returns nonzero. */
        if ((len -= 2) > 0) {
            do {
                opt = char_to_option((int )*ptr);
                if (opt != 0) {
                    options |= opt;
                } else {
                    break;
                }
                ++ptr;
            } while (--len > 0);
        }
        /* After a '-', the recognised options are turned off instead. */
        if (len > 1 && *ptr == '-') {
            ++ptr;
            --len;
            do {
                opt = char_to_option((int )*ptr);
                if (opt != 0) {
                    options &= ~opt;
                } else {
                    break;
                }
                ++ptr;
            } while (--len > 0);
        }
        /* ")" right after the option letters: skip it and re-examine the
         * remainder, keeping the options accumulated so far. */
        if (*ptr == ')') {
            --len;
            ++ptr;
            goto again;
        }
        /* "(?opts:...)" spanning the rest of the source: try compiling the
         * text between ':' and the final ')' with a temporary regex. */
        if (*ptr == ':' && ptr[len-1] == ')') {
            int r;
            Regexp *rp;
            r = onig_alloc_init(&rp, ONIG_OPTION_DEFAULT, ONIGENC_CASE_FOLD_DEFAULT, rb_enc_get(re), OnigDefaultSyntax);
            if (r == 0) {
                /* Drop the leading ':' and the trailing ')'. */
                ++ptr;
                len -= 2;
                err = (onig_compile(rp, ptr, ptr + len, NULL) != 0);
            }
            /* NOTE(review): rp is freed even when onig_alloc_init() failed;
             * confirm rp is always in a freeable state on that path. */
            onig_free(rp);
        }
        /* Prefix could not be validated: revert to the original options
         * and the complete source text. */
        if (err) {
            options = RREGEXP(re)->ptr->options;
            ptr = (UChar*)RREGEXP_SRC_PTR(re);
            len = RREGEXP_SRC_LEN(re);
        }
    }
    /* Append the option text only when option_to_str() produced any. */
    if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
    /* Unless all embeddable options are on, append "-" followed by the
     * text for the complement of the options. */
    if ((options & embeddable) != embeddable) {
        optbuf[0] = '-';
        option_to_str(optbuf + 1, ~options);
        rb_str_buf_cat2(str, optbuf);
    }
    rb_str_buf_cat2(str, ":");
    rb_reg_expr_str(str, (char*)ptr, len);
    rb_str_buf_cat2(str, ")");
    rb_enc_copy(str, re);
    OBJ_INFECT(str, re);
    return str;
}
Match—Matches rxp against the contents of $_
.
Equivalent to rxp =~ $_
.
$_ = "input data" ~ /at/ #=> 7
/*
 * Regexp#~ implementation: matches re against the value returned by
 * rb_lastline_get().
 *
 * If that value is not a String, the back-reference is reset to nil and
 * Qnil is returned.  Returns Qnil when rb_reg_search() reports a negative
 * position; otherwise the position is passed through rb_str_sublen() and
 * returned as a Fixnum.
 */
VALUE
rb_reg_match2(VALUE re)
{
    VALUE line = rb_lastline_get();
    long found;
    long offset;

    if (TYPE(line) != T_STRING) {
        rb_backref_set(Qnil);
        return Qnil;
    }

    found = rb_reg_search(re, line, 0, 0);
    if (found < 0) {
        return Qnil;
    }

    offset = rb_str_sublen(line, found);
    return LONG2FIX(offset);
}