//! Safe wrapper for [POSIX regular expressions API][regex-h] (provided by libc on POSIX-compliant OSes). //! //! [regex-h]: https://pubs.opengroup.org/onlinepubs/9699919799.2008edition/basedefs/regex.h.html#tag_13_37 //! //! ``` //! use regex_rs::*; //! //! let pattern = "This( often)? repeats time and again(, and again)*\\."; //! let compilation_flags = CompFlags::EXTENDED; //! let regex = Regex::new(pattern, compilation_flags) //! .expect("Failed to compile pattern as POSIX extended regular expression"); //! //! let input = "This repeats time and again, and again, and again."; //! // We're only interested in the first match, i.e. the part of text //! // that's matched by the whole regex //! let max_matches = 1; //! let match_flags = MatchFlags::empty(); //! let matches = regex //! .matches(input, max_matches, match_flags) //! .expect("Error matching input against regex"); //! //! // Found a match //! assert_eq!(matches.len(), 1); //! //! // Match spans from the beginning to the end of the input //! assert_eq!(matches[0].start_pos, 0); //! // `end_pos` holds one-past-the-end index //! assert_eq!(matches[0].end_pos, input.len()); //! ``` // This lint is nitpicky, I don't think it's really important how the literals are written. #![allow(clippy::unreadable_literal)] use bitflags::bitflags; use gettextrs::gettext; use libc::{regcomp, regerror, regex_t, regexec, regfree, regmatch_t}; use std::ffi::{CString, OsString}; use std::mem; use std::os::unix::ffi::OsStringExt; use std::ptr; use strprintf::fmt; /// POSIX regular expression. pub struct Regex { /// Compiled POSIX regular expression. regex: regex_t, } bitflags! { /// Compilation flags. /// /// These affect what features are available inside the regex, and also how it's matched /// against the input string. pub struct CompFlags: i32 { /// Use Extended Regular Expressions. /// /// POSIX calls this `REG_EXTENDED`. const EXTENDED = libc::REG_EXTENDED; /// Ignore case when matching. /// /// POSIX calls this `REG_ICASE`. const IGNORE_CASE = libc::REG_ICASE; /// Report only success or fail of the compilation. /// /// POSIX calls this `REG_NOSUB`. const NO_SUB = libc::REG_NOSUB; /// Give special meaning to newline characters. /// /// POSIX calls this `REG_NEWLINE`. /// /// Without this flag, newlines match themselves. /// /// With this flag, newlines match themselves except: /// /// 1. newline is not matched by `.` outside of bracket expressions or by any form of /// non-matching lists; /// /// 2. beginning-of-line (`^`) matches zero-width string right after newline, regardless of /// `CompFlag::NOTBOL`; /// /// 3. end-of-line (`$`) matches zero-width string right before a newline, regardless of /// `CompFlags::NOTEOL`. const NEWLINE = libc::REG_NEWLINE; } } bitflags! { /// Matching flags. /// /// These affect how regex is matched against the input string. pub struct MatchFlags: i32 { /// The circumflex character (`^`), when taken as a special character, does not match the /// beginning of string. const NOTBOL = libc::REG_NOTBOL; /// The dollar-sign (`$`), when taken as a special character, does not match the end of /// string. const NOTEOL = libc::REG_NOTEOL; } } /// Start and end positions of a matched substring. pub struct Match { /// Start position (counting from zero). pub start_pos: usize, /// One-past-end position (counting from zero). pub end_pos: usize, } /// A wrapper around `libc::regerror()`. unsafe fn regex_error_to_str(errcode: libc::c_int, regex: ®ex_t) -> Option { // Find out the size of the buffer needed to hold the error message let errmsg_length = regerror(errcode, regex, ptr::null_mut(), 0); // Allocate the buffer and get the message. // Casting u64 to usize is safe since the error message is unlikely to hit usize's // upper bound. let mut errmsg: Vec = vec![0; errmsg_length as usize]; // Casting `*mut u8` to `*mut c_char` should be safe since C doesn't really care: // it can store any ASCII symbol in a `char`, disregarding signedness. regerror( errcode, regex, errmsg.as_mut_ptr() as *mut std::os::raw::c_char, errmsg_length, ); // Drop the trailing NUL byte that C uses to terminate strings errmsg.pop(); OsString::from_vec(errmsg).into_string().ok() } impl Regex { /// Compiles pattern as a regular expression. /// /// By default, pattern is assumed to be a basic regular expression. To interpret it as an /// extended regular expression, add `CompFlags::EXTENDED` to the `flags`. See also other /// `CompFlags` values to control some other aspects of the regex. /// /// # Returns /// /// Compiled regex or an error message. pub fn new(pattern: &str, flags: CompFlags) -> Result { let pattern = CString::new(pattern) .map_err(|_| String::from("Regular expression contains NUL byte"))?; unsafe { let mut regex: regex_t = mem::zeroed(); let errcode = regcomp(&mut regex, pattern.into_raw(), flags.bits()); if errcode == 0 { Ok(Regex { regex }) } else { match regex_error_to_str(errcode, ®ex) { Some(regcomp_errmsg) => { let msg = fmt!(&gettext("regcomp returned code %i"), errcode); let msg = format!("{}: {}", msg, regcomp_errmsg); Err(msg) } None => Err(fmt!(&gettext("regcomp returned code %i"), errcode)), } } } } /// Matches input string against regex, looking for up to `max_matches` matches. /// /// Regexes can contain parenthesized subexpressions. This method will return up to /// `max_matches`-1 of those. First match is reserved for the text that the whole regex /// matched. /// /// `flags` dictate how matching is performed. See `MatchFlags` for details. /// /// # Returns /// /// - `Ok` with an empty vector if no match found, or if `max_matches` is 0 and there were no /// errors. /// - `Ok` with a non-empty vector if `max_matches` was non-zero and a match was found. First /// element of the vector is the text that regex as a whole matched. The rest of the elements /// are pieces of text that were matched by parenthesized subexpressions. /// - `Err` with an error message. pub fn matches( &self, input: &str, max_matches: usize, flags: MatchFlags, ) -> Result, String> { let input = CString::new(input).map_err(|_| String::from("Input string contains NUL byte"))?; let mut pmatch: Vec; let errcode = unsafe { pmatch = vec![mem::zeroed(); max_matches]; regexec( &self.regex, input.into_raw(), max_matches as libc::size_t, pmatch.as_mut_ptr(), flags.bits(), ) }; match errcode { 0 => { // Success. Let's copy results let mut matches: Vec = Vec::new(); for m in pmatch { if m.rm_so < 0 || m.rm_eo < 0 { // Since `max_matches` can be bigger than the number of parenthesized // blocks in the regex, it's possible that some of the `pmatch` values are // empty. We exit the loop after detecting first such value. break; } // It's safe to cast i32 to usize here: // - we already checked that the values aren't negative // - usize's upper bound is higher than i32's matches.push(Match { start_pos: m.rm_so as usize, end_pos: m.rm_eo as usize, }); } Ok(matches) } libc::REG_NOMATCH => { // Matching went okay, but nothing found Ok(Vec::new()) } // POSIX only specifies two return codes for regexec(), but implementations are free to // extend that. _ => unsafe { match regex_error_to_str(errcode, &self.regex) { Some(regexec_errmsg) => { let msg = fmt!(&gettext("regexec returned code %i"), errcode); let msg = format!("{}: {}", msg, regexec_errmsg); Err(msg) } None => Err(fmt!(&gettext("regexec returned code %i"), errcode)), } }, } } } impl Drop for Regex { fn drop(&mut self) { unsafe { regfree(&mut self.regex); } } } #[cfg(test)] mod tests { use super::*; #[test] fn matches_basic_posix_regular_expression() { let regex = Regex::new("abc+", CompFlags::empty()).unwrap(); let matches = regex.matches("abc+others", 1, MatchFlags::empty()).unwrap(); assert_eq!(matches.len(), 1); // Match spans from the start of the input until the 4th character assert_eq!(matches[0].start_pos, 0); assert_eq!(matches[0].end_pos, 4); // one-past-last offset } #[test] fn matches_extended_posix_regular_expression() { let regex = Regex::new("aBc+", CompFlags::EXTENDED | CompFlags::IGNORE_CASE).unwrap(); let matches = regex .matches("AbCcCcCC and others", 1, MatchFlags::empty()) .unwrap(); assert_eq!(matches.len(), 1); // Match spans from the start of the input until the 4th character assert_eq!(matches[0].start_pos, 0); assert_eq!(matches[0].end_pos, 8); // one-past-last offset } #[test] fn returns_empty_when_regex_valid_but_no_match() { let regex = Regex::new("abc", CompFlags::empty()).unwrap(); let matches = regex.matches("cba", 1, MatchFlags::empty()).unwrap(); assert_eq!(matches.len(), 0); } #[test] fn returns_no_more_results_than_max_matches() { let regex = Regex::new("(a)(b)(c)", CompFlags::EXTENDED).unwrap(); let max_matches = 2; let matches = regex .matches("abc", max_matches, MatchFlags::empty()) .unwrap(); assert_eq!(matches.len(), 2); assert_eq!(matches[0].start_pos, 0); assert_eq!(matches[0].end_pos, 3); assert_eq!(matches[1].start_pos, 0); assert_eq!(matches[1].end_pos, 1); } #[test] fn returns_no_more_results_than_available() { let regex = Regex::new("abc", CompFlags::EXTENDED).unwrap(); let max_matches = 10; let matches = regex .matches("abc", max_matches, MatchFlags::empty()) .unwrap(); assert_eq!(matches.len(), 1); assert_eq!(matches[0].start_pos, 0); assert_eq!(matches[0].end_pos, 3); } #[test] fn new_returns_error_on_invalid_regex() { let result = Regex::new("(abc", CompFlags::EXTENDED); assert!(result.is_err()); if let Err(msg) = result { // There should be at least an error code, so string can't possibly be empty assert!(!msg.is_empty()); // The message shouldn't contain a C string terminator (NUL) at the end assert!(!msg.ends_with('\0')); } } }