# Source code for duplicity.globmatch

# -*- Mode:Python; indent-tabs-mode:nil; tab-width:4; encoding:utf-8 -*-
#
# Copyright 2002 Ben Escoto <ben@emerose.org>
# Copyright 2007 Kenneth Loafman <kenneth@loafman.com>
# Copyright 2014 Aaron Whitehouse <aaron@whitehouse.kiwi.nz>
#
# This file is part of duplicity.
#
# Duplicity is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# Duplicity is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with duplicity; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# All functions in this module only accept unicode. Any byte strings should
# be converted to unicode before sending them into this.

import re

from builtins import map
from builtins import range
from builtins import str


class GlobbingError(Exception):
    u"""Raised when something has gone wrong while parsing a glob string"""
    pass


class FilePrefixError(GlobbingError):
    u"""Signals that a specified file doesn't start with the correct prefix"""
    pass


def _glob_get_prefix_regexs(glob_str):
    u"""Return list of regexps equivalent to prefixes of glob_str

    glob_str is split on "/" and every leading run of components (the
    first one, the first two, ... up to the whole string) is re-joined
    and translated with glob_to_regex.  A leading empty component, as
    produced by a glob starting with "/", is treated as the root "/".

    Raises GlobbingError if glob_str contains consecutive "/" characters.
    """
    # Internal. Used by select_fn_from_glob.
    glob_parts = glob_str.split(u"/")
    if u"" in glob_parts[1:-1]:
        # "" OK if comes first or last, as in /foo/
        raise GlobbingError(u"Consecutive '/'s found in globbing string " +
                            glob_str)

    # One prefix per component count: parts[:1], parts[:2], ..., parts[:n]
    prefixes = [u"/".join(glob_parts[:count])
                for count in range(1, len(glob_parts) + 1)]
    # we must make exception for root "/", only dir to end in slash
    if prefixes[0] == u"":
        prefixes[0] = u"/"
    return [glob_to_regex(prefix) for prefix in prefixes]


def select_fn_from_glob(glob_str, include, ignore_case=False):
    u"""Return a function test_fn(path) which tests whether path matches
    glob_str, as per the Unix shell rules.

    Arguments:

    glob_str - the glob, as a unicode string
    include - 0 if glob_str is an exclude glob, 1 if it is an include glob
    ignore_case - if true, the match is case-insensitive

    The returned test_fn reads path.uc_name (and calls path.isdir() when
    glob_str ended with a "/") and returns:

    0 - if the file should be excluded
    1 - if the file should be included
    2 - if the folder should be scanned for any included/excluded files
    None - if the selection function has nothing to say about the file

    The basic idea is to turn glob_str into a regular expression,
    and just use the normal regular expression.  There is a
    complication because the selection function should return '2'
    (scan) for directories which may contain a file which matches
    the glob_str.  So we break up the glob string into parts, and
    any file which matches an initial sequence of glob parts gets
    scanned.

    Thanks to Donovan Baarda who provided some code which did some
    things similar to this.

    Note: including a folder implicitly includes everything within it.
    """
    assert isinstance(glob_str, str)
    glob_ends_w_slash = False

    if glob_str == u"/":
        # If the glob string is '/', it implicitly includes everything
        glob_str = u"/**"
    elif glob_str[-1] == u"/":
        glob_ends_w_slash = True
        # Remove trailing / from directory name (unless that is the entire
        # string)
        glob_str = glob_str[:-1]

    flags = re.S
    if ignore_case:
        flags |= re.IGNORECASE

    def re_comp(pattern):
        # Compile with the flags shared by every regex built here
        return re.compile(pattern, flags)

    # Translate once; both regexes below are built from the same pattern
    glob_regex = glob_to_regex(glob_str)

    # matches what glob matches and any files in directory
    # Resulting regular expression is:
    # ^ string must be at the beginning of path
    # string translated into regex
    # ($|/) nothing must follow except for the end of the string, newline or /
    # Note that the "/" at the end of the regex means that it will match
    # if the glob matches a parent folder of path, i.e. including a folder
    # includes everything within it.
    glob_comp_re = re_comp(u"^%s($|/)" % glob_regex)

    # A version of glob_comp_re that does not match folder contents.  It is
    # only needed (and only built) when the glob ended with a "/", where it
    # is used to check that an exact match is actually a folder, rather
    # than a file.
    glob_comp_re_exact = None
    if glob_ends_w_slash:
        glob_comp_re_exact = re_comp(u"^%s($)" % glob_regex)

    double_star_pos = glob_str.find(u"**")
    if double_star_pos != -1:
        # glob_str has a ** in it
        glob_str = glob_str[:double_star_pos + 2]  # truncate after **

    # Below regex translates to:
    # ^ string must be at the beginning of path
    # the regexs corresponding to the parent directories of glob_str
    # $ nothing must follow except for the end of the string or newline
    scan_comp_re = re_comp(u"^(%s)$" %
                           u"|".join(_glob_get_prefix_regexs(glob_str)))

    def test_fn(path):
        assert not path.uc_name[-1] == u"/" or path.uc_name == u"/", \
            u"path.name should never end in '/' during normal operation for " \
            u"normal paths (except '/' alone)\n" \
            u"path.name here is " + path.uc_name + u" and glob is " + glob_str

        if glob_comp_re.match(path.uc_name):
            # Path matches glob, or is contained within a matching folder.
            # If the glob ended with a / and path is an exact match for it
            # (rather than something inside a matching folder), then path
            # must itself be a folder for the glob to apply.
            if (glob_ends_w_slash and
                    glob_comp_re_exact.match(path.uc_name) and
                    not path.isdir()):
                # Exact match and not a folder
                return None
            return include

        if include == 1 and scan_comp_re.match(path.uc_name):
            # path is a parent of something an include glob may match
            return 2

        return None

    return test_fn


def glob_to_regex(pat):
    u"""Return a regular expression string equivalent to shell glob pat

    pat must be a unicode string.  The translation is:

    **    - matches anything, including "/"
    *     - matches any run of characters other than "/"
    ?     - matches a single character other than "/"
    [...] - becomes a regex character class; a leading ! or ^ negates it
            and backslashes inside it are escaped.  No special handling
            is done for ranges like [a-z]; the bracket contents are
            otherwise copied into the character class unchanged.  A "["
            with no closing "]" is treated as a literal "[".

    Every other character is passed through re.escape.  There is no
    way to quote the special characters.

    This function taken with minor modifications from efnmatch.py
    by Donovan Baarda.

    """
    # Internal. Used by select_fn_from_glob, _glob_get_prefix_regexs and
    # unit tests.

    assert isinstance(pat, str)

    i, n = 0, len(pat)
    # Collect the translated pieces and join once at the end rather than
    # growing a string with repeated concatenation.
    pieces = []
    while i < n:
        c, s = pat[i], pat[i:i + 2]
        i += 1
        if s == u'**':
            pieces.append(u'.*')
            i += 1  # consume the second '*' as well
        elif c == u'*':
            pieces.append(u'[^/]*')
        elif c == u'?':
            pieces.append(u'[^/]')
        elif c == u'[':
            # Look for the closing ']'.  A ']' directly after the opening
            # '[' (or after a leading negation) is part of the set itself.
            j = i
            if j < n and pat[j] in u'!^':
                j += 1
            if j < n and pat[j] == u']':
                j += 1
            while j < n and pat[j] != u']':
                j += 1
            if j >= n:
                pieces.append(u'\\[')  # interpret the [ literally
            else:
                # Deal with inside of [..]
                stuff = pat[i:j].replace(u'\\', u'\\\\')
                i = j + 1
                if stuff[0] in u'!^':
                    stuff = u'^' + stuff[1:]
                pieces.append(u'[' + stuff + u']')
        else:
            pieces.append(re.escape(c))
    return u''.join(pieces)