464 lines
17 KiB
Python
464 lines
17 KiB
Python
"""
|
|
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
|
"""
|
|
|
|
import itertools
|
|
import posixpath
|
|
import re
|
|
import warnings
|
|
from pathlib import Path, PurePath
|
|
from typing import Optional, Pattern, Sequence, Tuple
|
|
|
|
from ._base import AbstractSanitizer, AbstractValidator, BaseFile, BaseValidator
|
|
from ._common import findall_to_str, is_nt_abspath, to_str, truncate_str, validate_pathtype
|
|
from ._const import DEFAULT_MIN_LEN, INVALID_CHAR_ERR_MSG_TMPL, Platform
|
|
from ._types import PathType, PlatformType
|
|
from .error import ErrorAttrKey, ErrorReason, InvalidCharError, ValidationError
|
|
from .handler import ReservedNameHandler, ValidationErrorHandler
|
|
|
|
|
|
_DEFAULT_MAX_FILENAME_LEN = 255
|
|
_RE_INVALID_FILENAME = re.compile(f"[{re.escape(BaseFile._INVALID_FILENAME_CHARS):s}]", re.UNICODE)
|
|
_RE_INVALID_WIN_FILENAME = re.compile(
|
|
f"[{re.escape(BaseFile._INVALID_WIN_FILENAME_CHARS):s}]", re.UNICODE
|
|
)
|
|
|
|
|
|
class FileNameSanitizer(AbstractSanitizer):
|
|
def __init__(
|
|
self,
|
|
max_len: int = _DEFAULT_MAX_FILENAME_LEN,
|
|
fs_encoding: Optional[str] = None,
|
|
platform: Optional[PlatformType] = None,
|
|
null_value_handler: Optional[ValidationErrorHandler] = None,
|
|
reserved_name_handler: Optional[ValidationErrorHandler] = None,
|
|
additional_reserved_names: Optional[Sequence[str]] = None,
|
|
validate_after_sanitize: bool = False,
|
|
validator: Optional[AbstractValidator] = None,
|
|
) -> None:
|
|
if validator:
|
|
fname_validator = validator
|
|
else:
|
|
fname_validator = FileNameValidator(
|
|
min_len=DEFAULT_MIN_LEN,
|
|
max_len=max_len,
|
|
fs_encoding=fs_encoding,
|
|
check_reserved=True,
|
|
additional_reserved_names=additional_reserved_names,
|
|
platform=platform,
|
|
)
|
|
|
|
super().__init__(
|
|
max_len=max_len,
|
|
fs_encoding=fs_encoding,
|
|
null_value_handler=null_value_handler,
|
|
reserved_name_handler=reserved_name_handler,
|
|
additional_reserved_names=additional_reserved_names,
|
|
platform=platform,
|
|
validate_after_sanitize=validate_after_sanitize,
|
|
validator=fname_validator,
|
|
)
|
|
|
|
self._sanitize_regexp = self._get_sanitize_regexp()
|
|
|
|
def sanitize(self, value: PathType, replacement_text: str = "") -> PathType:
|
|
try:
|
|
validate_pathtype(value, allow_whitespaces=not self._is_windows(include_universal=True))
|
|
except ValidationError as e:
|
|
if e.reason == ErrorReason.NULL_NAME:
|
|
if isinstance(value, PurePath):
|
|
raise
|
|
|
|
return self._null_value_handler(e) # type: ignore
|
|
raise
|
|
|
|
sanitized_filename = self._sanitize_regexp.sub(replacement_text, str(value))
|
|
sanitized_filename = truncate_str(sanitized_filename, self._fs_encoding, self.max_len)
|
|
|
|
try:
|
|
self._validator.validate(sanitized_filename)
|
|
except ValidationError as e:
|
|
if e.reason == ErrorReason.RESERVED_NAME:
|
|
replacement_word = self._reserved_name_handler(e)
|
|
if e.reserved_name != replacement_word:
|
|
sanitized_filename = re.sub(
|
|
re.escape(e.reserved_name), replacement_word, sanitized_filename
|
|
)
|
|
elif e.reason == ErrorReason.INVALID_CHARACTER and self._is_windows(
|
|
include_universal=True
|
|
):
|
|
# Do not start a file or directory name with a space
|
|
sanitized_filename = sanitized_filename.lstrip(" ")
|
|
|
|
# Do not end a file or directory name with a space or a period
|
|
sanitized_filename = sanitized_filename.rstrip(" ")
|
|
if sanitized_filename not in (".", ".."):
|
|
sanitized_filename = sanitized_filename.rstrip(" .")
|
|
elif e.reason == ErrorReason.NULL_NAME:
|
|
sanitized_filename = self._null_value_handler(e)
|
|
|
|
if self._validate_after_sanitize:
|
|
try:
|
|
self._validator.validate(sanitized_filename)
|
|
except ValidationError as e:
|
|
raise ValidationError(
|
|
description=str(e),
|
|
reason=ErrorReason.INVALID_AFTER_SANITIZE,
|
|
platform=self.platform,
|
|
)
|
|
|
|
if isinstance(value, PurePath):
|
|
return Path(sanitized_filename) # type: ignore
|
|
|
|
return sanitized_filename # type: ignore
|
|
|
|
def _get_sanitize_regexp(self) -> Pattern[str]:
|
|
if self._is_windows(include_universal=True):
|
|
return _RE_INVALID_WIN_FILENAME
|
|
|
|
return _RE_INVALID_FILENAME
|
|
|
|
|
|
class FileNameValidator(BaseValidator):
|
|
_WINDOWS_RESERVED_FILE_NAMES = ("CON", "PRN", "AUX", "CLOCK$", "NUL") + tuple(
|
|
f"{name:s}{num:d}" for name, num in itertools.product(("COM", "LPT"), range(1, 10))
|
|
)
|
|
_MACOS_RESERVED_FILE_NAMES = (":",)
|
|
|
|
@property
|
|
def reserved_keywords(self) -> Tuple[str, ...]:
|
|
common_keywords = super().reserved_keywords
|
|
|
|
if self._is_universal():
|
|
word_set = set(
|
|
common_keywords
|
|
+ self._WINDOWS_RESERVED_FILE_NAMES
|
|
+ self._MACOS_RESERVED_FILE_NAMES
|
|
)
|
|
elif self._is_windows():
|
|
word_set = set(common_keywords + self._WINDOWS_RESERVED_FILE_NAMES)
|
|
elif self._is_posix() or self._is_macos():
|
|
word_set = set(common_keywords + self._MACOS_RESERVED_FILE_NAMES)
|
|
else:
|
|
word_set = set(common_keywords)
|
|
|
|
return tuple(sorted(word_set))
|
|
|
|
def __init__(
|
|
self,
|
|
min_len: int = DEFAULT_MIN_LEN,
|
|
max_len: int = _DEFAULT_MAX_FILENAME_LEN,
|
|
fs_encoding: Optional[str] = None,
|
|
platform: Optional[PlatformType] = None,
|
|
check_reserved: bool = True,
|
|
additional_reserved_names: Optional[Sequence[str]] = None,
|
|
) -> None:
|
|
super().__init__(
|
|
min_len=min_len,
|
|
max_len=max_len,
|
|
fs_encoding=fs_encoding,
|
|
check_reserved=check_reserved,
|
|
additional_reserved_names=additional_reserved_names,
|
|
platform=platform,
|
|
)
|
|
|
|
def validate(self, value: PathType) -> None:
|
|
validate_pathtype(value, allow_whitespaces=not self._is_windows(include_universal=True))
|
|
|
|
unicode_filename = to_str(value)
|
|
byte_ct = len(unicode_filename.encode(self._fs_encoding))
|
|
|
|
self.validate_abspath(unicode_filename)
|
|
|
|
err_kwargs = {
|
|
ErrorAttrKey.REASON: ErrorReason.INVALID_LENGTH,
|
|
ErrorAttrKey.PLATFORM: self.platform,
|
|
ErrorAttrKey.FS_ENCODING: self._fs_encoding,
|
|
ErrorAttrKey.BYTE_COUNT: byte_ct,
|
|
}
|
|
if byte_ct > self.max_len:
|
|
raise ValidationError(
|
|
[
|
|
f"filename is too long: expected<={self.max_len:d} bytes, actual={byte_ct:d} bytes"
|
|
],
|
|
**err_kwargs,
|
|
)
|
|
if byte_ct < self.min_len:
|
|
raise ValidationError(
|
|
[
|
|
f"filename is too short: expected>={self.min_len:d} bytes, actual={byte_ct:d} bytes"
|
|
],
|
|
**err_kwargs,
|
|
)
|
|
|
|
self._validate_reserved_keywords(unicode_filename)
|
|
self.__validate_universal_filename(unicode_filename)
|
|
|
|
if self._is_windows(include_universal=True):
|
|
self.__validate_win_filename(unicode_filename)
|
|
|
|
def validate_abspath(self, value: str) -> None:
|
|
err = ValidationError(
|
|
description=f"found an absolute path ({value}), expected a filename",
|
|
platform=self.platform,
|
|
reason=ErrorReason.FOUND_ABS_PATH,
|
|
)
|
|
|
|
if self._is_windows(include_universal=True):
|
|
if is_nt_abspath(value):
|
|
raise err
|
|
|
|
if posixpath.isabs(value):
|
|
raise err
|
|
|
|
def __validate_universal_filename(self, unicode_filename: str) -> None:
|
|
match = _RE_INVALID_FILENAME.findall(unicode_filename)
|
|
if match:
|
|
raise InvalidCharError(
|
|
INVALID_CHAR_ERR_MSG_TMPL.format(
|
|
invalid=findall_to_str(match), value=repr(unicode_filename)
|
|
),
|
|
platform=Platform.UNIVERSAL,
|
|
)
|
|
|
|
def __validate_win_filename(self, unicode_filename: str) -> None:
|
|
match = _RE_INVALID_WIN_FILENAME.findall(unicode_filename)
|
|
if match:
|
|
raise InvalidCharError(
|
|
INVALID_CHAR_ERR_MSG_TMPL.format(
|
|
invalid=findall_to_str(match), value=repr(unicode_filename)
|
|
),
|
|
platform=Platform.WINDOWS,
|
|
)
|
|
|
|
if unicode_filename in (".", ".."):
|
|
return
|
|
|
|
KB2829981_err_tmpl = "{}. Refer: https://learn.microsoft.com/en-us/troubleshoot/windows-client/shell-experience/file-folder-name-whitespace-characters" # noqa: E501
|
|
|
|
if unicode_filename[-1] in (" ", "."):
|
|
raise InvalidCharError(
|
|
INVALID_CHAR_ERR_MSG_TMPL.format(
|
|
invalid=re.escape(unicode_filename[-1]), value=repr(unicode_filename)
|
|
),
|
|
platform=Platform.WINDOWS,
|
|
description=KB2829981_err_tmpl.format(
|
|
"Do not end a file or directory name with a space or a period"
|
|
),
|
|
)
|
|
|
|
if unicode_filename[0] in (" "):
|
|
raise InvalidCharError(
|
|
INVALID_CHAR_ERR_MSG_TMPL.format(
|
|
invalid=re.escape(unicode_filename[0]), value=repr(unicode_filename)
|
|
),
|
|
platform=Platform.WINDOWS,
|
|
description=KB2829981_err_tmpl.format(
|
|
"Do not start a file or directory name with a space"
|
|
),
|
|
)
|
|
|
|
|
|
def validate_filename(
|
|
filename: PathType,
|
|
platform: Optional[PlatformType] = None,
|
|
min_len: int = DEFAULT_MIN_LEN,
|
|
max_len: int = _DEFAULT_MAX_FILENAME_LEN,
|
|
fs_encoding: Optional[str] = None,
|
|
check_reserved: bool = True,
|
|
additional_reserved_names: Optional[Sequence[str]] = None,
|
|
) -> None:
|
|
"""Verifying whether the ``filename`` is a valid file name or not.
|
|
|
|
Args:
|
|
filename:
|
|
Filename to validate.
|
|
platform:
|
|
Target platform name of the filename.
|
|
|
|
.. include:: platform.txt
|
|
min_len:
|
|
Minimum byte length of the ``filename``. The value must be greater or equal to one.
|
|
Defaults to ``1``.
|
|
max_len:
|
|
Maximum byte length of the ``filename``. The value must be lower than:
|
|
|
|
- ``Linux``: 4096
|
|
- ``macOS``: 1024
|
|
- ``Windows``: 260
|
|
- ``universal``: 260
|
|
|
|
Defaults to ``255``.
|
|
fs_encoding:
|
|
Filesystem encoding that used to calculate the byte length of the filename.
|
|
If |None|, get the value from the execution environment.
|
|
check_reserved:
|
|
If |True|, check the reserved names of the ``platform``.
|
|
additional_reserved_names:
|
|
Additional reserved names to check.
|
|
Case insensitive.
|
|
|
|
Raises:
|
|
ValidationError (ErrorReason.INVALID_LENGTH):
|
|
If the ``filename`` is longer than ``max_len`` characters.
|
|
ValidationError (ErrorReason.INVALID_CHARACTER):
|
|
If the ``filename`` includes invalid character(s) for a filename:
|
|
|invalid_filename_chars|.
|
|
The following characters are also invalid for Windows platforms:
|
|
|invalid_win_filename_chars|.
|
|
ValidationError (ErrorReason.RESERVED_NAME):
|
|
If the ``filename`` equals the reserved name by OS.
|
|
Windows reserved name is as follows:
|
|
``"CON"``, ``"PRN"``, ``"AUX"``, ``"NUL"``, ``"COM[1-9]"``, ``"LPT[1-9]"``.
|
|
|
|
Example:
|
|
:ref:`example-validate-filename`
|
|
|
|
See Also:
|
|
`Naming Files, Paths, and Namespaces - Win32 apps | Microsoft Docs
|
|
<https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file>`__
|
|
"""
|
|
|
|
FileNameValidator(
|
|
platform=platform,
|
|
min_len=min_len,
|
|
max_len=max_len,
|
|
fs_encoding=fs_encoding,
|
|
check_reserved=check_reserved,
|
|
additional_reserved_names=additional_reserved_names,
|
|
).validate(filename)
|
|
|
|
|
|
def is_valid_filename(
|
|
filename: PathType,
|
|
platform: Optional[PlatformType] = None,
|
|
min_len: int = DEFAULT_MIN_LEN,
|
|
max_len: Optional[int] = None,
|
|
fs_encoding: Optional[str] = None,
|
|
check_reserved: bool = True,
|
|
additional_reserved_names: Optional[Sequence[str]] = None,
|
|
) -> bool:
|
|
"""Check whether the ``filename`` is a valid name or not.
|
|
|
|
Args:
|
|
filename:
|
|
A filename to be checked.
|
|
platform:
|
|
Target platform name of the filename.
|
|
|
|
Example:
|
|
:ref:`example-is-valid-filename`
|
|
|
|
See Also:
|
|
:py:func:`.validate_filename()`
|
|
"""
|
|
|
|
return FileNameValidator(
|
|
platform=platform,
|
|
min_len=min_len,
|
|
max_len=-1 if max_len is None else max_len,
|
|
fs_encoding=fs_encoding,
|
|
check_reserved=check_reserved,
|
|
additional_reserved_names=additional_reserved_names,
|
|
).is_valid(filename)
|
|
|
|
|
|
def sanitize_filename(
|
|
filename: PathType,
|
|
replacement_text: str = "",
|
|
platform: Optional[PlatformType] = None,
|
|
max_len: Optional[int] = _DEFAULT_MAX_FILENAME_LEN,
|
|
fs_encoding: Optional[str] = None,
|
|
check_reserved: Optional[bool] = None,
|
|
null_value_handler: Optional[ValidationErrorHandler] = None,
|
|
reserved_name_handler: Optional[ValidationErrorHandler] = None,
|
|
additional_reserved_names: Optional[Sequence[str]] = None,
|
|
validate_after_sanitize: bool = False,
|
|
) -> PathType:
|
|
"""Make a valid filename from a string.
|
|
|
|
To make a valid filename, the function does the following:
|
|
|
|
- Replace invalid characters as file names included in the ``filename``
|
|
with the ``replacement_text``. Invalid characters are:
|
|
|
|
- unprintable characters
|
|
- |invalid_filename_chars|
|
|
- for Windows (or universal) only: |invalid_win_filename_chars|
|
|
|
|
- Replace a value if a sanitized value is a reserved name by operating systems
|
|
with a specified handler by ``reserved_name_handler``.
|
|
|
|
Args:
|
|
filename: Filename to sanitize.
|
|
replacement_text:
|
|
Replacement text for invalid characters. Defaults to ``""``.
|
|
platform:
|
|
Target platform name of the filename.
|
|
|
|
.. include:: platform.txt
|
|
max_len:
|
|
Maximum byte length of the ``filename``.
|
|
Truncate the name length if the ``filename`` length exceeds this value.
|
|
Defaults to ``255``.
|
|
fs_encoding:
|
|
Filesystem encoding that used to calculate the byte length of the filename.
|
|
If |None|, get the value from the execution environment.
|
|
check_reserved:
|
|
[Deprecated] Use 'reserved_name_handler' instead.
|
|
null_value_handler:
|
|
Function called when a value after sanitization is an empty string.
|
|
You can specify predefined handlers:
|
|
|
|
- :py:func:`~.handler.NullValueHandler.return_null_string`
|
|
- :py:func:`~.handler.NullValueHandler.return_timestamp`
|
|
- :py:func:`~.handler.raise_error`
|
|
|
|
Defaults to :py:func:`.handler.NullValueHandler.return_null_string` that just return ``""``.
|
|
reserved_name_handler:
|
|
Function called when a value after sanitization is a reserved name.
|
|
You can specify predefined handlers:
|
|
|
|
- :py:meth:`~.handler.ReservedNameHandler.add_leading_underscore`
|
|
- :py:meth:`~.handler.ReservedNameHandler.add_trailing_underscore`
|
|
- :py:meth:`~.handler.ReservedNameHandler.as_is`
|
|
- :py:func:`~.handler.raise_error`
|
|
|
|
Defaults to :py:func:`.handler.add_trailing_underscore`.
|
|
additional_reserved_names:
|
|
Additional reserved names to sanitize.
|
|
Case insensitive.
|
|
validate_after_sanitize:
|
|
Execute validation after sanitization to the file name.
|
|
|
|
Returns:
|
|
Same type as the ``filename`` (str or PathLike object):
|
|
Sanitized filename.
|
|
|
|
Raises:
|
|
ValueError:
|
|
If the ``filename`` is an invalid filename.
|
|
|
|
Example:
|
|
:ref:`example-sanitize-filename`
|
|
"""
|
|
|
|
if check_reserved is not None:
|
|
warnings.warn(
|
|
"'check_reserved' is deprecated. Use 'reserved_name_handler' instead.",
|
|
DeprecationWarning,
|
|
)
|
|
|
|
if check_reserved is False:
|
|
reserved_name_handler = ReservedNameHandler.as_is
|
|
|
|
return FileNameSanitizer(
|
|
platform=platform,
|
|
max_len=-1 if max_len is None else max_len,
|
|
fs_encoding=fs_encoding,
|
|
null_value_handler=null_value_handler,
|
|
reserved_name_handler=reserved_name_handler,
|
|
additional_reserved_names=additional_reserved_names,
|
|
validate_after_sanitize=validate_after_sanitize,
|
|
).sanitize(filename, replacement_text)
|