// Copyright (c) 2017, Daniel Martí
// See LICENSE for licensing information

// Package pattern allows working with shell pattern matching notation, also
// known as wildcards or globbing.
//
// For reference, see
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_13.
package pattern

import (
	"fmt"
	"io"
	"regexp"
	"strings"
	"unicode/utf8"
)

// Mode can be used to supply a number of options to the package's functions.
// Not all functions change their behavior with all of the options below.
type Mode uint

// SyntaxError describes a malformed pattern. The functions in this package
// report such patterns by returning a *SyntaxError.
type SyntaxError struct {
	msg string // human-readable description, returned by Error
	err error  // optional underlying cause, returned by Unwrap; may be nil
}

// Error returns the description of the syntax error.
func (e SyntaxError) Error() string { return e.msg }

// Unwrap returns the underlying cause of the syntax error, if any.
func (e SyntaxError) Unwrap() error { return e.err }

// TODO(v4): flip NoGlobStar to be opt-in via GlobStar, matching bash
// TODO(v4): flip EntireString to be opt-out via PartialMatch, as EntireString causes subtle bugs when forgotten
// TODO(v4): rename NoGlobCase to CaseInsensitive for readability

// The options below are bit flags, and can be combined with the | operator.
const (
	Shortest     Mode = 1 << iota // prefer the shortest match.
	Filenames                     // "*" and "?" don't match slashes; only "**" does
	EntireString                  // match the entire string using ^$ delimiters
	NoGlobCase                    // Do case-insensitive match (that is, use (?i) in the regexp)
	NoGlobStar                    // Do not support "**"
)

// Regexp turns a shell pattern into a regular expression that can be used with
// [regexp.Compile]. It will return an error if the input pattern was incorrect.
// Otherwise, the returned expression can be passed to [regexp.MustCompile].
//
// For example, Regexp(`foo*bar?`, 0) returns `(?s)foo.*bar.`.
// A pattern without any pattern or regular expression metacharacters is
// returned unchanged, unless [EntireString] or [NoGlobCase] is set.
//
// Note that this function (and [QuoteMeta]) should not be directly used with file
// paths if Windows is supported, as the path separator on that platform is the
// same character as the escaping character for shell patterns.
func Regexp(pat string, mode Mode) (string, error) { // If there are no special pattern matching or regular expression characters, // and we don't need to insert extras for the modes affecting non-special characters, // we can directly return the input string as a short-cut. if mode&(EntireString|NoGlobCase) == 0 { needsEscaping := false noopLoop: for _, r := range pat { switch r { // including those that need escaping since they are // regular expression metacharacters case '*', '?', '[', '\\', '.', '+', '(', ')', '|', ']', '{', '}', '^', '$': needsEscaping = true break noopLoop } } if !needsEscaping { return pat, nil } } var sb strings.Builder // Enable matching `\n` with the `.` metacharacter as globs match `\n` sb.WriteString("(?s") if mode&NoGlobCase != 0 { sb.WriteString("i") } if mode&Shortest != 0 { sb.WriteString("U") } sb.WriteString(")") if mode&EntireString != 0 { sb.WriteString("^") } sl := stringLexer{s: pat} for { if err := regexpNext(&sb, &sl, mode); err == io.EOF { break } else if err != nil { return "", err } } if mode&EntireString != 0 { sb.WriteString("$") } return sb.String(), nil } // stringLexer helps us tokenize a pattern string. // Note that we can use the null byte '\x00' to signal "no character" as shell strings cannot contain null bytes. // TODO: should the tokenization be based on runes? 
e.g: [á-é] type stringLexer struct { s string i int } func (sl *stringLexer) next() byte { if sl.i >= len(sl.s) { return '\x00' } c := sl.s[sl.i] sl.i++ return c } func (sl *stringLexer) last() byte { if sl.i < 2 { return '\x00' } return sl.s[sl.i-2] } func (sl *stringLexer) peekNext() byte { if sl.i >= len(sl.s) { return '\x00' } return sl.s[sl.i] } func (sl *stringLexer) peekRest() string { return sl.s[sl.i:] } func regexpNext(sb *strings.Builder, sl *stringLexer, mode Mode) error { switch c := sl.next(); c { case '\x00': return io.EOF case '*': if mode&Filenames == 0 { // * - matches anything when not in filename mode sb.WriteString(".*") break } // "**" only acts as globstar if it is alone as a path element. singleBefore := sl.i == 1 || sl.last() == '/' if sl.peekNext() == '*' { sl.i++ singleAfter := sl.i == len(sl.s) || sl.peekNext() == '/' if mode&NoGlobStar == 0 && singleBefore && singleAfter { if sl.peekNext() == '/' { // **/ - like "**" but requiring a trailing slash when matching sl.i++ sb.WriteString("((/|[^/.][^/]*)*/)?") } else { // ** - match any number of slashes or "*" path elements sb.WriteString("(/|[^/.][^/]*)*") } break } // foo**, **bar, or NoGlobStar - behaves like "*" below } // * - matches anything except slashes and leading dots if singleBefore { sb.WriteString("([^/.][^/]*)?") } else { sb.WriteString("[^/]*") } case '?': if mode&Filenames != 0 { sb.WriteString("[^/]") } else { sb.WriteByte('.') } case '\\': c = sl.next() if c == '\x00' { return &SyntaxError{msg: `\ at end of pattern`} } sb.WriteString(regexp.QuoteMeta(string(c))) case '[': // TODO: surely char classes can be mixed with others, e.g. 
[[:foo:]xyz] if name, err := charClass(sl.peekRest()); err != nil { return &SyntaxError{msg: "charClass invalid", err: err} } else if name != "" { sb.WriteByte('[') sb.WriteString(name) sl.i += len(name) break } if mode&Filenames != 0 { for _, c := range sl.peekRest() { if c == ']' { break } else if c == '/' { sb.WriteString("\\[") return nil } } } sb.WriteByte(c) if c = sl.next(); c == '\x00' { return &SyntaxError{msg: "[ was not matched with a closing ]"} } switch c { case '!', '^': sb.WriteByte('^') if c = sl.next(); c == '\x00' { return &SyntaxError{msg: "[ was not matched with a closing ]"} } } if c == ']' { sb.WriteByte(']') if c = sl.next(); c == '\x00' { return &SyntaxError{msg: "[ was not matched with a closing ]"} } } for { sb.WriteByte(c) switch c { case '\x00': return &SyntaxError{msg: "[ was not matched with a closing ]"} case '\\': if c = sl.next(); c != '0' { sb.WriteByte(c) } case '-': start := sl.last() end := sl.peekNext() // TODO: what about overlapping ranges, like: [a--z] if end != ']' && start > end { return &SyntaxError{msg: fmt.Sprintf("invalid range: %c-%c", start, end)} } case ']': return nil } c = sl.next() } default: if c > utf8.RuneSelf { sb.WriteByte(c) } else { sb.WriteString(regexp.QuoteMeta(string(c))) } } return nil } func charClass(s string) (string, error) { if strings.HasPrefix(s, "[.") || strings.HasPrefix(s, "[=") { return "", fmt.Errorf("collating features not available") } name, ok := strings.CutPrefix(s, "[:") if !ok { return "", nil } name, _, ok = strings.Cut(name, ":]]") if !ok { return "", fmt.Errorf("[[: was not matched with a closing :]]") } switch name { case "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "word", "xdigit": default: return "", fmt.Errorf("invalid character class: %q", name) } return s[:len(name)+5], nil } // HasMeta returns whether a string contains any unescaped pattern // metacharacters: '*', '?', or '['. 
When the function returns false, the given // pattern can only match at most one string. // // For example, HasMeta(`foo\*bar`) returns false, but HasMeta(`foo*bar`) // returns true. // // This can be useful to avoid extra work, like [Regexp]. Note that this // function cannot be used to avoid [QuoteMeta], as backslashes are quoted by // that function but ignored here. // // The [Mode] parameter is unused, and will be removed in v4. func HasMeta(pat string, mode Mode) bool { for i := 0; i < len(pat); i++ { switch pat[i] { case '\\': i++ case '*', '?', '[': return true } } return false } // QuoteMeta returns a string that quotes all pattern metacharacters in the // given text. The returned string is a pattern that matches the literal text. // // For example, QuoteMeta(`foo*bar?`) returns `foo\*bar\?`. // // The [Mode] parameter is unused, and will be removed in v4. func QuoteMeta(pat string, mode Mode) string { needsEscaping := false loop: for _, r := range pat { switch r { case '*', '?', '[', '\\': needsEscaping = true break loop } } if !needsEscaping { // short-cut without a string copy return pat } var sb strings.Builder for _, r := range pat { switch r { case '*', '?', '[', '\\': sb.WriteByte('\\') } sb.WriteRune(r) } return sb.String() }