2024-12-09 18:22:38 +09:00
parent ab0cbebefc
commit c4c4547706
959 changed files with 174888 additions and 6 deletions

env/bin/gitlab vendored Executable file
@@ -0,0 +1,8 @@
#!/home/dongho/netsec/env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from gitlab.cli import main
if __name__ == '__main__':
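    # The re.sub below strips a Windows launcher suffix ('-script.pyw' or '.exe')
    # from argv[0] so the CLI reports a clean program name.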
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

env/bin/normalizer vendored Executable file
@@ -0,0 +1,8 @@
#!/home/dongho/netsec/env/bin/python
# -*- coding: utf-8 -*-
import re
import sys
from charset_normalizer.cli import cli_detect
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli_detect())

@@ -0,0 +1 @@
pip

@@ -0,0 +1,20 @@
Copyright (c) 2017-2021 Ingy döt Net
Copyright (c) 2006-2016 Kirill Simonov
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -0,0 +1,46 @@
Metadata-Version: 2.1
Name: PyYAML
Version: 6.0.2
Summary: YAML parser and emitter for Python
Home-page: https://pyyaml.org/
Download-URL: https://pypi.org/project/PyYAML/
Author: Kirill Simonov
Author-email: xi@resolvent.net
License: MIT
Project-URL: Bug Tracker, https://github.com/yaml/pyyaml/issues
Project-URL: CI, https://github.com/yaml/pyyaml/actions
Project-URL: Documentation, https://pyyaml.org/wiki/PyYAMLDocumentation
Project-URL: Mailing lists, http://lists.sourceforge.net/lists/listinfo/yaml-core
Project-URL: Source Code, https://github.com/yaml/pyyaml
Platform: Any
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Cython
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: Markup
Requires-Python: >=3.8
License-File: LICENSE
YAML is a data serialization format designed for human readability
and interaction with scripting languages. PyYAML is a YAML parser
and emitter for Python.
PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
support, a capable extension API, and sensible error messages. PyYAML
supports standard YAML tags and provides Python-specific tags that
allow representing arbitrary Python objects.
PyYAML is applicable for a broad range of tasks from complex
configuration files to object serialization and persistence.
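The description above covers the API at a high level; as a minimal illustrative sketch (not part of this commit), a round trip with the safe variants looks like this:

```python
# Illustrative sketch: round-trip a document with PyYAML's safe API.
# safe_load/safe_dump stick to standard YAML tags; the Python-specific
# tags mentioned above require the full Loader/Dumper.
import yaml

doc = """
server:
  host: example.org
  ports: [8080, 8443]
"""

data = yaml.safe_load(doc)
print(data["server"]["ports"])        # [8080, 8443]
print(yaml.safe_dump(data), end="")   # emit the structure back as YAML text
```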

@@ -0,0 +1,44 @@
PyYAML-6.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
PyYAML-6.0.2.dist-info/LICENSE,sha256=jTko-dxEkP1jVwfLiOsmvXZBAqcoKVQwfT5RZ6V36KQ,1101
PyYAML-6.0.2.dist-info/METADATA,sha256=9-odFB5seu4pGPcEv7E8iyxNF51_uKnaNGjLAhz2lto,2060
PyYAML-6.0.2.dist-info/RECORD,,
PyYAML-6.0.2.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
PyYAML-6.0.2.dist-info/WHEEL,sha256=YM7r_UgTB_CA6ZLGHfbOA_dd7lb6fUn0DsfI9DvIHHE,154
PyYAML-6.0.2.dist-info/top_level.txt,sha256=rpj0IVMTisAjh_1vG3Ccf9v5jpCQwAz6cD1IVU5ZdhQ,11
_yaml/__init__.py,sha256=04Ae_5osxahpJHa3XBZUAf4wi6XX32gR8D6X6p64GEA,1402
_yaml/__pycache__/__init__.cpython-312.pyc,,
yaml/__init__.py,sha256=N35S01HMesFTe0aRRMWkPj0Pa8IEbHpE9FK7cr5Bdtw,12311
yaml/__pycache__/__init__.cpython-312.pyc,,
yaml/__pycache__/composer.cpython-312.pyc,,
yaml/__pycache__/constructor.cpython-312.pyc,,
yaml/__pycache__/cyaml.cpython-312.pyc,,
yaml/__pycache__/dumper.cpython-312.pyc,,
yaml/__pycache__/emitter.cpython-312.pyc,,
yaml/__pycache__/error.cpython-312.pyc,,
yaml/__pycache__/events.cpython-312.pyc,,
yaml/__pycache__/loader.cpython-312.pyc,,
yaml/__pycache__/nodes.cpython-312.pyc,,
yaml/__pycache__/parser.cpython-312.pyc,,
yaml/__pycache__/reader.cpython-312.pyc,,
yaml/__pycache__/representer.cpython-312.pyc,,
yaml/__pycache__/resolver.cpython-312.pyc,,
yaml/__pycache__/scanner.cpython-312.pyc,,
yaml/__pycache__/serializer.cpython-312.pyc,,
yaml/__pycache__/tokens.cpython-312.pyc,,
yaml/_yaml.cpython-312-aarch64-linux-gnu.so,sha256=kYQNF-yCT1TQJkdO87ihsv1jctF0lAaJ2wYRWZXqWRI,2456968
yaml/composer.py,sha256=_Ko30Wr6eDWUeUpauUGT3Lcg9QPBnOPVlTnIMRGJ9FM,4883
yaml/constructor.py,sha256=kNgkfaeLUkwQYY_Q6Ff1Tz2XVw_pG1xVE9Ak7z-viLA,28639
yaml/cyaml.py,sha256=6ZrAG9fAYvdVe2FK_w0hmXoG7ZYsoYUwapG8CiC72H0,3851
yaml/dumper.py,sha256=PLctZlYwZLp7XmeUdwRuv4nYOZ2UBnDIUy8-lKfLF-o,2837
yaml/emitter.py,sha256=jghtaU7eFwg31bG0B7RZea_29Adi9CKmXq_QjgQpCkQ,43006
yaml/error.py,sha256=Ah9z-toHJUbE9j-M8YpxgSRM5CgLCcwVzJgLLRF2Fxo,2533
yaml/events.py,sha256=50_TksgQiE4up-lKo_V-nBy-tAIxkIPQxY5qDhKCeHw,2445
yaml/loader.py,sha256=UVa-zIqmkFSCIYq_PgSGm4NSJttHY2Rf_zQ4_b1fHN0,2061
yaml/nodes.py,sha256=gPKNj8pKCdh2d4gr3gIYINnPOaOxGhJAUiYhGRnPE84,1440
yaml/parser.py,sha256=ilWp5vvgoHFGzvOZDItFoGjD6D42nhlZrZyjAwa0oJo,25495
yaml/reader.py,sha256=0dmzirOiDG4Xo41RnuQS7K9rkY3xjHiVasfDMNTqCNw,6794
yaml/representer.py,sha256=IuWP-cAW9sHKEnS0gCqSa894k1Bg4cgTxaDwIcbRQ-Y,14190
yaml/resolver.py,sha256=9L-VYfm4mWHxUD1Vg4X7rjDRK_7VZd6b92wzq7Y2IKY,9004
yaml/scanner.py,sha256=YEM3iLZSaQwXcQRg2l2R4MdT0zGP2F9eHkKGKnHyWQY,51279
yaml/serializer.py,sha256=ChuFgmhU01hj4xgI8GaKv6vfM2Bujwa9i7d2FAHj7cA,4165
yaml/tokens.py,sha256=lTQIzSVw8Mg9wv459-TjiOQe6wVziqaRlqX2_89rp54,2573

@@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.44.0)
Root-Is-Purelib: false
Tag: cp312-cp312-manylinux_2_17_aarch64
Tag: cp312-cp312-manylinux2014_aarch64

@@ -0,0 +1,2 @@
_yaml
yaml

@@ -0,0 +1,33 @@
# This is a stub package designed to roughly emulate the _yaml
# extension module, which previously existed as a standalone module
# and has been moved into the `yaml` package namespace.
# It does not perfectly mimic its old counterpart, but should get
# close enough for anyone who's relying on it even when they shouldn't.
import yaml
# in some circumstances, the yaml module we imported may be from a different version, so we need
# to tread carefully when poking at it here (it may not have the attributes we expect)
if not getattr(yaml, '__with_libyaml__', False):
from sys import version_info
exc = ModuleNotFoundError if version_info >= (3, 6) else ImportError
raise exc("No module named '_yaml'")
else:
from yaml._yaml import *
import warnings
warnings.warn(
'The _yaml extension module is now located at yaml._yaml'
' and its location is subject to change. To use the'
' LibYAML-based parser and emitter, import from `yaml`:'
' `from yaml import CLoader as Loader, CDumper as Dumper`.',
DeprecationWarning
)
del warnings
# Don't `del yaml` here because yaml is actually an existing
# namespace member of _yaml.
__name__ = '_yaml'
# If the module is top-level (i.e. not a part of any specific package)
# then the attribute should be set to ''.
# https://docs.python.org/3.8/library/types.html
__package__ = ''
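The deprecation warning above already names the supported import path; a small sketch of that idiom, with the standard pure-Python fallback when the LibYAML extension is unavailable (illustrative, not part of this commit):

```python
# Illustrative sketch: prefer the LibYAML-based classes, fall back to the
# pure-Python ones when the C extension was not built.
import yaml

try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

data = yaml.load("a: 1\nb: [2, 3]\n", Loader=Loader)
print(yaml.dump(data, Dumper=Dumper), end="")
```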

@@ -0,0 +1 @@
pip

@@ -0,0 +1,23 @@
# This is the MIT license
Copyright (c) 2010 ActiveState Software Inc.
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

@@ -0,0 +1,264 @@
Metadata-Version: 2.1
Name: appdirs
Version: 1.4.4
Summary: A small Python module for determining appropriate platform-specific dirs, e.g. a "user data dir".
Home-page: http://github.com/ActiveState/appdirs
Author: Trent Mick
Author-email: trentm@gmail.com
Maintainer: Jeff Rouse
Maintainer-email: jr@its.to
License: MIT
Keywords: application directory log cache user
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.4
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Topic :: Software Development :: Libraries :: Python Modules
.. image:: https://secure.travis-ci.org/ActiveState/appdirs.png
:target: http://travis-ci.org/ActiveState/appdirs
the problem
===========
What directory should your app use for storing user data? If running on Mac OS X, you
should use::
~/Library/Application Support/<AppName>
If on Windows (at least English Win XP) that should be::
C:\Documents and Settings\<User>\Application Data\Local Settings\<AppAuthor>\<AppName>
or possibly::
C:\Documents and Settings\<User>\Application Data\<AppAuthor>\<AppName>
for `roaming profiles <http://bit.ly/9yl3b6>`_ but that is another story.
On Linux (and other Unices) the dir, according to the `XDG
spec <http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html>`_, is::
~/.local/share/<AppName>
``appdirs`` to the rescue
=========================
This kind of thing is what the ``appdirs`` module is for. ``appdirs`` will
help you choose an appropriate:
- user data dir (``user_data_dir``)
- user config dir (``user_config_dir``)
- user cache dir (``user_cache_dir``)
- site data dir (``site_data_dir``)
- site config dir (``site_config_dir``)
- user log dir (``user_log_dir``)
and also:
- is a single module so other Python packages can include their own private copy
- is slightly opinionated on the directory names used. Look for "OPINION" in
documentation and code for when an opinion is being applied.
some example output
===================
On Mac OS X::
>>> from appdirs import *
>>> appname = "SuperApp"
>>> appauthor = "Acme"
>>> user_data_dir(appname, appauthor)
'/Users/trentm/Library/Application Support/SuperApp'
>>> site_data_dir(appname, appauthor)
'/Library/Application Support/SuperApp'
>>> user_cache_dir(appname, appauthor)
'/Users/trentm/Library/Caches/SuperApp'
>>> user_log_dir(appname, appauthor)
'/Users/trentm/Library/Logs/SuperApp'
On Windows 7::
>>> from appdirs import *
>>> appname = "SuperApp"
>>> appauthor = "Acme"
>>> user_data_dir(appname, appauthor)
'C:\\Users\\trentm\\AppData\\Local\\Acme\\SuperApp'
>>> user_data_dir(appname, appauthor, roaming=True)
'C:\\Users\\trentm\\AppData\\Roaming\\Acme\\SuperApp'
>>> user_cache_dir(appname, appauthor)
'C:\\Users\\trentm\\AppData\\Local\\Acme\\SuperApp\\Cache'
>>> user_log_dir(appname, appauthor)
'C:\\Users\\trentm\\AppData\\Local\\Acme\\SuperApp\\Logs'
On Linux::
>>> from appdirs import *
>>> appname = "SuperApp"
>>> appauthor = "Acme"
>>> user_data_dir(appname, appauthor)
'/home/trentm/.local/share/SuperApp'
>>> site_data_dir(appname, appauthor)
'/usr/local/share/SuperApp'
>>> site_data_dir(appname, appauthor, multipath=True)
'/usr/local/share/SuperApp:/usr/share/SuperApp'
>>> user_cache_dir(appname, appauthor)
'/home/trentm/.cache/SuperApp'
>>> user_log_dir(appname, appauthor)
'/home/trentm/.cache/SuperApp/log'
>>> user_config_dir(appname)
'/home/trentm/.config/SuperApp'
>>> site_config_dir(appname)
'/etc/xdg/SuperApp'
>>> os.environ['XDG_CONFIG_DIRS'] = '/etc:/usr/local/etc'
>>> site_config_dir(appname, multipath=True)
'/etc/SuperApp:/usr/local/etc/SuperApp'
``AppDirs`` for convenience
===========================
::
>>> from appdirs import AppDirs
>>> dirs = AppDirs("SuperApp", "Acme")
>>> dirs.user_data_dir
'/Users/trentm/Library/Application Support/SuperApp'
>>> dirs.site_data_dir
'/Library/Application Support/SuperApp'
>>> dirs.user_cache_dir
'/Users/trentm/Library/Caches/SuperApp'
>>> dirs.user_log_dir
'/Users/trentm/Library/Logs/SuperApp'
Per-version isolation
=====================
If you have multiple versions of your app in use that you want to be
able to run side-by-side, then you may want version-isolation for these
dirs::
>>> from appdirs import AppDirs
>>> dirs = AppDirs("SuperApp", "Acme", version="1.0")
>>> dirs.user_data_dir
'/Users/trentm/Library/Application Support/SuperApp/1.0'
>>> dirs.site_data_dir
'/Library/Application Support/SuperApp/1.0'
>>> dirs.user_cache_dir
'/Users/trentm/Library/Caches/SuperApp/1.0'
>>> dirs.user_log_dir
'/Users/trentm/Library/Logs/SuperApp/1.0'
appdirs Changelog
=================
appdirs 1.4.4
-------------
- [PR #92] Don't import appdirs from setup.py
Project officially classified as Stable which is important
for inclusion in other distros such as ActivePython.
First of several incremental releases to catch up on maintenance.
appdirs 1.4.3
-------------
- [PR #76] Python 3.6 invalid escape sequence deprecation fixes
- Fix for Python 3.6 support
appdirs 1.4.2
-------------
- [PR #84] Allow installing without setuptools
- [PR #86] Fix string delimiters in setup.py description
- Add Python 3.6 support
appdirs 1.4.1
-------------
- [issue #38] Fix _winreg import on Windows Py3
- [issue #55] Make appname optional
appdirs 1.4.0
-------------
- [PR #42] AppAuthor is now optional on Windows
- [issue 41] Support Jython on Windows, Mac, and Unix-like platforms. Windows
support requires `JNA <https://github.com/twall/jna>`_.
- [PR #44] Fix incorrect behaviour of the site_config_dir method
appdirs 1.3.0
-------------
- [Unix, issue 16] Conform to XDG standard, instead of breaking it for
everybody
- [Unix] Remove gratuitous case mangling, since \*nix-es are
usually case-sensitive, so mangling is not wise
- [Unix] Fixes the utterly wrong behaviour in ``site_data_dir``, return result
based on XDG_DATA_DIRS and make room for respecting the standard which
specifies XDG_DATA_DIRS is a multiple-value variable
- [Issue 6] Add ``*_config_dir`` which are distinct on nix-es, according to
XDG specs; on Windows and Mac return the corresponding ``*_data_dir``
appdirs 1.2.0
-------------
- [Unix] Put ``user_log_dir`` under the *cache* dir on Unix. Seems to be more
typical.
- [issue 9] Make ``unicode`` work on py3k.
appdirs 1.1.0
-------------
- [issue 4] Add ``AppDirs.user_log_dir``.
- [Unix, issue 2, issue 7] appdirs now conforms to `XDG base directory spec
<http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html>`_.
- [Mac, issue 5] Fix ``site_data_dir()`` on Mac.
- [Mac] Drop use of 'Carbon' module in favour of hardcoded paths; supports
Python3 now.
- [Windows] Append "Cache" to ``user_cache_dir`` on Windows by default. Use
``opinion=False`` option to disable this.
- Add ``appdirs.AppDirs`` convenience class. Usage:
>>> dirs = AppDirs("SuperApp", "Acme", version="1.0")
>>> dirs.user_data_dir
'/Users/trentm/Library/Application Support/SuperApp/1.0'
- [Windows] Cherry-pick Komodo's change to downgrade paths to the Windows short
paths if there are high bit chars.
- [Linux] Change default ``user_cache_dir()`` on Linux to be singular, e.g.
"~/.superapp/cache".
- [Windows] Add ``roaming`` option to ``user_data_dir()`` (for use on Windows only)
and change the default ``user_data_dir`` behaviour to use a *non*-roaming
profile dir (``CSIDL_LOCAL_APPDATA`` instead of ``CSIDL_APPDATA``). Why? Because
a large roaming profile can cause login speed issues. The "only syncs on
logout" behaviour can cause surprises in appdata info.
appdirs 1.0.1 (never released)
------------------------------
Started this changelog 27 July 2010. Before that this module originated in the
`Komodo <http://www.activestate.com/komodo>`_ product as ``applib.py`` and then
as `applib/location.py
<http://github.com/ActiveState/applib/blob/master/applib/location.py>`_ (used by
`PyPM <http://code.activestate.com/pypm/>`_ in `ActivePython
<http://www.activestate.com/activepython>`_). This is basically a fork of
applib.py 1.0.1 and applib/location.py 1.0.1.

@@ -0,0 +1,8 @@
__pycache__/appdirs.cpython-312.pyc,,
appdirs-1.4.4.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
appdirs-1.4.4.dist-info/LICENSE.txt,sha256=Nt200KdFqTqyAyA9cZCBSxuJcn0lTK_0jHp6-71HAAs,1097
appdirs-1.4.4.dist-info/METADATA,sha256=k5TVfXMNKGHTfp2wm6EJKTuGwGNuoQR5TqQgH8iwG8M,8981
appdirs-1.4.4.dist-info/RECORD,,
appdirs-1.4.4.dist-info/WHEEL,sha256=kGT74LWyRUZrL4VgLh6_g12IeVl_9u9ZVhadrgXZUEY,110
appdirs-1.4.4.dist-info/top_level.txt,sha256=nKncE8CUqZERJ6VuQWL4_bkunSPDNfn7KZqb4Tr5YEM,8
appdirs.py,sha256=g99s2sXhnvTEm79oj4bWI0Toapc-_SmKKNXvOXHkVic,24720

@@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.34.2)
Root-Is-Purelib: true
Tag: py2-none-any
Tag: py3-none-any

@@ -0,0 +1 @@
appdirs

@@ -0,0 +1,608 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2005-2010 ActiveState Software Inc.
# Copyright (c) 2013 Eddy Petrișor
"""Utilities for determining application-specific dirs.
See <http://github.com/ActiveState/appdirs> for details and usage.
"""
# Dev Notes:
# - MSDN on where to store app data files:
# http://support.microsoft.com/default.aspx?scid=kb;en-us;310294#XSLTH3194121123120121120120
# - Mac OS X: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/index.html
# - XDG spec for Un*x: http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
__version__ = "1.4.4"
__version_info__ = tuple(int(segment) for segment in __version__.split("."))
import sys
import os
PY3 = sys.version_info[0] == 3
if PY3:
unicode = str
if sys.platform.startswith('java'):
import platform
os_name = platform.java_ver()[3][0]
if os_name.startswith('Windows'): # "Windows XP", "Windows 7", etc.
system = 'win32'
elif os_name.startswith('Mac'): # "Mac OS X", etc.
system = 'darwin'
else: # "Linux", "SunOS", "FreeBSD", etc.
# Setting this to "linux2" is not ideal, but only Windows or Mac
# are actually checked for and the rest of the module expects
# *sys.platform* style strings.
system = 'linux2'
else:
system = sys.platform
def user_data_dir(appname=None, appauthor=None, version=None, roaming=False):
r"""Return full path to the user-specific data dir for this application.
"appname" is the name of application.
If None, just the system directory is returned.
"appauthor" (only used on Windows) is the name of the
appauthor or distributing body for this application. Typically
it is the owning company name. This falls back to appname. You may
pass False to disable it.
"version" is an optional version path element to append to the
path. You might want to use this if you want multiple versions
of your app to be able to run independently. If used, this
would typically be "<major>.<minor>".
Only applied when appname is present.
"roaming" (boolean, default False) can be set True to use the Windows
roaming appdata directory. That means that for users on a Windows
network setup for roaming profiles, this user data will be
sync'd on login. See
<http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx>
for a discussion of issues.
Typical user data directories are:
Mac OS X: ~/Library/Application Support/<AppName>
Unix: ~/.local/share/<AppName> # or in $XDG_DATA_HOME, if defined
Win XP (not roaming): C:\Documents and Settings\<username>\Application Data\<AppAuthor>\<AppName>
Win XP (roaming): C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>
Win 7 (not roaming): C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>
Win 7 (roaming): C:\Users\<username>\AppData\Roaming\<AppAuthor>\<AppName>
For Unix, we follow the XDG spec and support $XDG_DATA_HOME.
That means, by default "~/.local/share/<AppName>".
"""
if system == "win32":
if appauthor is None:
appauthor = appname
const = roaming and "CSIDL_APPDATA" or "CSIDL_LOCAL_APPDATA"
path = os.path.normpath(_get_win_folder(const))
if appname:
if appauthor is not False:
path = os.path.join(path, appauthor, appname)
else:
path = os.path.join(path, appname)
elif system == 'darwin':
path = os.path.expanduser('~/Library/Application Support/')
if appname:
path = os.path.join(path, appname)
else:
path = os.getenv('XDG_DATA_HOME', os.path.expanduser("~/.local/share"))
if appname:
path = os.path.join(path, appname)
if appname and version:
path = os.path.join(path, version)
return path
def site_data_dir(appname=None, appauthor=None, version=None, multipath=False):
r"""Return full path to the user-shared data dir for this application.
"appname" is the name of application.
If None, just the system directory is returned.
"appauthor" (only used on Windows) is the name of the
appauthor or distributing body for this application. Typically
it is the owning company name. This falls back to appname. You may
pass False to disable it.
"version" is an optional version path element to append to the
path. You might want to use this if you want multiple versions
of your app to be able to run independently. If used, this
would typically be "<major>.<minor>".
Only applied when appname is present.
"multipath" is an optional parameter only applicable to *nix
which indicates that the entire list of data dirs should be
returned. By default, the first item from XDG_DATA_DIRS is
returned, or '/usr/local/share/<AppName>',
if XDG_DATA_DIRS is not set
Typical site data directories are:
Mac OS X: /Library/Application Support/<AppName>
Unix: /usr/local/share/<AppName> or /usr/share/<AppName>
Win XP: C:\Documents and Settings\All Users\Application Data\<AppAuthor>\<AppName>
Vista: (Fail! "C:\ProgramData" is a hidden *system* directory on Vista.)
Win 7: C:\ProgramData\<AppAuthor>\<AppName> # Hidden, but writeable on Win 7.
For Unix, this is using the $XDG_DATA_DIRS[0] default.
WARNING: Do not use this on Windows. See the Vista-Fail note above for why.
"""
if system == "win32":
if appauthor is None:
appauthor = appname
path = os.path.normpath(_get_win_folder("CSIDL_COMMON_APPDATA"))
if appname:
if appauthor is not False:
path = os.path.join(path, appauthor, appname)
else:
path = os.path.join(path, appname)
elif system == 'darwin':
path = os.path.expanduser('/Library/Application Support')
if appname:
path = os.path.join(path, appname)
else:
# XDG default for $XDG_DATA_DIRS
# only first, if multipath is False
path = os.getenv('XDG_DATA_DIRS',
os.pathsep.join(['/usr/local/share', '/usr/share']))
pathlist = [os.path.expanduser(x.rstrip(os.sep)) for x in path.split(os.pathsep)]
if appname:
if version:
appname = os.path.join(appname, version)
pathlist = [os.sep.join([x, appname]) for x in pathlist]
if multipath:
path = os.pathsep.join(pathlist)
else:
path = pathlist[0]
return path
if appname and version:
path = os.path.join(path, version)
return path
def user_config_dir(appname=None, appauthor=None, version=None, roaming=False):
r"""Return full path to the user-specific config dir for this application.
"appname" is the name of application.
If None, just the system directory is returned.
"appauthor" (only used on Windows) is the name of the
appauthor or distributing body for this application. Typically
it is the owning company name. This falls back to appname. You may
pass False to disable it.
"version" is an optional version path element to append to the
path. You might want to use this if you want multiple versions
of your app to be able to run independently. If used, this
would typically be "<major>.<minor>".
Only applied when appname is present.
"roaming" (boolean, default False) can be set True to use the Windows
roaming appdata directory. That means that for users on a Windows
network setup for roaming profiles, this user data will be
sync'd on login. See
<http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx>
for a discussion of issues.
Typical user config directories are:
Mac OS X: same as user_data_dir
Unix: ~/.config/<AppName> # or in $XDG_CONFIG_HOME, if defined
Win *: same as user_data_dir
For Unix, we follow the XDG spec and support $XDG_CONFIG_HOME.
That means, by default "~/.config/<AppName>".
"""
if system in ["win32", "darwin"]:
path = user_data_dir(appname, appauthor, None, roaming)
else:
path = os.getenv('XDG_CONFIG_HOME', os.path.expanduser("~/.config"))
if appname:
path = os.path.join(path, appname)
if appname and version:
path = os.path.join(path, version)
return path
def site_config_dir(appname=None, appauthor=None, version=None, multipath=False):
r"""Return full path to the user-shared data dir for this application.
"appname" is the name of application.
If None, just the system directory is returned.
"appauthor" (only used on Windows) is the name of the
appauthor or distributing body for this application. Typically
it is the owning company name. This falls back to appname. You may
pass False to disable it.
"version" is an optional version path element to append to the
path. You might want to use this if you want multiple versions
of your app to be able to run independently. If used, this
would typically be "<major>.<minor>".
Only applied when appname is present.
"multipath" is an optional parameter only applicable to *nix
which indicates that the entire list of config dirs should be
returned. By default, the first item from XDG_CONFIG_DIRS is
returned, or '/etc/xdg/<AppName>', if XDG_CONFIG_DIRS is not set
Typical site config directories are:
Mac OS X: same as site_data_dir
Unix: /etc/xdg/<AppName> or $XDG_CONFIG_DIRS[i]/<AppName> for each value in
$XDG_CONFIG_DIRS
Win *: same as site_data_dir
Vista: (Fail! "C:\ProgramData" is a hidden *system* directory on Vista.)
For Unix, this is using the $XDG_CONFIG_DIRS[0] default, if multipath=False
WARNING: Do not use this on Windows. See the Vista-Fail note above for why.
"""
if system in ["win32", "darwin"]:
path = site_data_dir(appname, appauthor)
if appname and version:
path = os.path.join(path, version)
else:
# XDG default for $XDG_CONFIG_DIRS
# only first, if multipath is False
path = os.getenv('XDG_CONFIG_DIRS', '/etc/xdg')
pathlist = [os.path.expanduser(x.rstrip(os.sep)) for x in path.split(os.pathsep)]
if appname:
if version:
appname = os.path.join(appname, version)
pathlist = [os.sep.join([x, appname]) for x in pathlist]
if multipath:
path = os.pathsep.join(pathlist)
else:
path = pathlist[0]
return path
def user_cache_dir(appname=None, appauthor=None, version=None, opinion=True):
r"""Return full path to the user-specific cache dir for this application.
"appname" is the name of application.
If None, just the system directory is returned.
"appauthor" (only used on Windows) is the name of the
appauthor or distributing body for this application. Typically
it is the owning company name. This falls back to appname. You may
pass False to disable it.
"version" is an optional version path element to append to the
path. You might want to use this if you want multiple versions
of your app to be able to run independently. If used, this
would typically be "<major>.<minor>".
Only applied when appname is present.
"opinion" (boolean) can be False to disable the appending of
"Cache" to the base app data dir for Windows. See
discussion below.
Typical user cache directories are:
Mac OS X: ~/Library/Caches/<AppName>
Unix: ~/.cache/<AppName> (XDG default)
Win XP: C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>\Cache
Vista: C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>\Cache
On Windows the only suggestion in the MSDN docs is that local settings go in
the `CSIDL_LOCAL_APPDATA` directory. This is identical to the non-roaming
app data dir (the default returned by `user_data_dir` above). Apps typically
put cache data somewhere *under* the given dir here. Some examples:
...\Mozilla\Firefox\Profiles\<ProfileName>\Cache
...\Acme\SuperApp\Cache\1.0
OPINION: This function appends "Cache" to the `CSIDL_LOCAL_APPDATA` value.
This can be disabled with the `opinion=False` option.
"""
if system == "win32":
if appauthor is None:
appauthor = appname
path = os.path.normpath(_get_win_folder("CSIDL_LOCAL_APPDATA"))
if appname:
if appauthor is not False:
path = os.path.join(path, appauthor, appname)
else:
path = os.path.join(path, appname)
if opinion:
path = os.path.join(path, "Cache")
elif system == 'darwin':
path = os.path.expanduser('~/Library/Caches')
if appname:
path = os.path.join(path, appname)
else:
path = os.getenv('XDG_CACHE_HOME', os.path.expanduser('~/.cache'))
if appname:
path = os.path.join(path, appname)
if appname and version:
path = os.path.join(path, version)
return path
def user_state_dir(appname=None, appauthor=None, version=None, roaming=False):
r"""Return full path to the user-specific state dir for this application.
"appname" is the name of application.
If None, just the system directory is returned.
"appauthor" (only used on Windows) is the name of the
appauthor or distributing body for this application. Typically
it is the owning company name. This falls back to appname. You may
pass False to disable it.
"version" is an optional version path element to append to the
path. You might want to use this if you want multiple versions
of your app to be able to run independently. If used, this
would typically be "<major>.<minor>".
Only applied when appname is present.
"roaming" (boolean, default False) can be set True to use the Windows
roaming appdata directory. That means that for users on a Windows
network setup for roaming profiles, this user data will be
sync'd on login. See
<http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx>
for a discussion of issues.
Typical user state directories are:
Mac OS X: same as user_data_dir
Unix: ~/.local/state/<AppName> # or in $XDG_STATE_HOME, if defined
Win *: same as user_data_dir
For Unix, we follow this Debian proposal <https://wiki.debian.org/XDGBaseDirectorySpecification#state>
to extend the XDG spec and support $XDG_STATE_HOME.
That means, by default "~/.local/state/<AppName>".
"""
if system in ["win32", "darwin"]:
path = user_data_dir(appname, appauthor, None, roaming)
else:
path = os.getenv('XDG_STATE_HOME', os.path.expanduser("~/.local/state"))
if appname:
path = os.path.join(path, appname)
if appname and version:
path = os.path.join(path, version)
return path
def user_log_dir(appname=None, appauthor=None, version=None, opinion=True):
r"""Return full path to the user-specific log dir for this application.
"appname" is the name of application.
If None, just the system directory is returned.
"appauthor" (only used on Windows) is the name of the
appauthor or distributing body for this application. Typically
it is the owning company name. This falls back to appname. You may
pass False to disable it.
"version" is an optional version path element to append to the
path. You might want to use this if you want multiple versions
of your app to be able to run independently. If used, this
would typically be "<major>.<minor>".
Only applied when appname is present.
"opinion" (boolean) can be False to disable the appending of
"Logs" to the base app data dir for Windows, and "log" to the
base cache dir for Unix. See discussion below.
Typical user log directories are:
Mac OS X: ~/Library/Logs/<AppName>
Unix: ~/.cache/<AppName>/log # or under $XDG_CACHE_HOME if defined
Win XP: C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>\Logs
Vista: C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>\Logs
On Windows the only suggestion in the MSDN docs is that local settings
go in the `CSIDL_LOCAL_APPDATA` directory. (Note: I'm interested in
examples of what some windows apps use for a logs dir.)
OPINION: This function appends "Logs" to the `CSIDL_LOCAL_APPDATA`
value for Windows and appends "log" to the user cache dir for Unix.
This can be disabled with the `opinion=False` option.
"""
if system == "darwin":
path = os.path.join(
os.path.expanduser('~/Library/Logs'),
appname)
elif system == "win32":
path = user_data_dir(appname, appauthor, version)
version = False
if opinion:
path = os.path.join(path, "Logs")
else:
path = user_cache_dir(appname, appauthor, version)
version = False
if opinion:
path = os.path.join(path, "log")
if appname and version:
path = os.path.join(path, version)
return path
class AppDirs(object):
"""Convenience wrapper for getting application dirs."""
def __init__(self, appname=None, appauthor=None, version=None,
roaming=False, multipath=False):
self.appname = appname
self.appauthor = appauthor
self.version = version
self.roaming = roaming
self.multipath = multipath
@property
def user_data_dir(self):
return user_data_dir(self.appname, self.appauthor,
version=self.version, roaming=self.roaming)
@property
def site_data_dir(self):
return site_data_dir(self.appname, self.appauthor,
version=self.version, multipath=self.multipath)
@property
def user_config_dir(self):
return user_config_dir(self.appname, self.appauthor,
version=self.version, roaming=self.roaming)
@property
def site_config_dir(self):
return site_config_dir(self.appname, self.appauthor,
version=self.version, multipath=self.multipath)
@property
def user_cache_dir(self):
return user_cache_dir(self.appname, self.appauthor,
version=self.version)
@property
def user_state_dir(self):
return user_state_dir(self.appname, self.appauthor,
version=self.version)
@property
def user_log_dir(self):
return user_log_dir(self.appname, self.appauthor,
version=self.version)
#---- internal support stuff
def _get_win_folder_from_registry(csidl_name):
"""This is a fallback technique at best. I'm not sure if using the
registry for this guarantees us the correct answer for all CSIDL_*
names.
"""
if PY3:
import winreg as _winreg
else:
import _winreg
shell_folder_name = {
"CSIDL_APPDATA": "AppData",
"CSIDL_COMMON_APPDATA": "Common AppData",
"CSIDL_LOCAL_APPDATA": "Local AppData",
}[csidl_name]
key = _winreg.OpenKey(
_winreg.HKEY_CURRENT_USER,
r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders"
)
dir, type = _winreg.QueryValueEx(key, shell_folder_name)
return dir
def _get_win_folder_with_pywin32(csidl_name):
from win32com.shell import shellcon, shell
dir = shell.SHGetFolderPath(0, getattr(shellcon, csidl_name), 0, 0)
# Try to make this a unicode path because SHGetFolderPath does
# not return unicode strings when there is unicode data in the
# path.
try:
dir = unicode(dir)
# Downgrade to short path name if have highbit chars. See
# <http://bugs.activestate.com/show_bug.cgi?id=85099>.
has_high_char = False
for c in dir:
if ord(c) > 255:
has_high_char = True
break
if has_high_char:
try:
import win32api
dir = win32api.GetShortPathName(dir)
except ImportError:
pass
except UnicodeError:
pass
return dir
def _get_win_folder_with_ctypes(csidl_name):
import ctypes
csidl_const = {
"CSIDL_APPDATA": 26,
"CSIDL_COMMON_APPDATA": 35,
"CSIDL_LOCAL_APPDATA": 28,
}[csidl_name]
buf = ctypes.create_unicode_buffer(1024)
ctypes.windll.shell32.SHGetFolderPathW(None, csidl_const, None, 0, buf)
# Downgrade to short path name if have highbit chars. See
# <http://bugs.activestate.com/show_bug.cgi?id=85099>.
has_high_char = False
for c in buf:
if ord(c) > 255:
has_high_char = True
break
if has_high_char:
buf2 = ctypes.create_unicode_buffer(1024)
if ctypes.windll.kernel32.GetShortPathNameW(buf.value, buf2, 1024):
buf = buf2
return buf.value
def _get_win_folder_with_jna(csidl_name):
import array
from com.sun import jna
from com.sun.jna.platform import win32
buf_size = win32.WinDef.MAX_PATH * 2
buf = array.zeros('c', buf_size)
shell = win32.Shell32.INSTANCE
shell.SHGetFolderPath(None, getattr(win32.ShlObj, csidl_name), None, win32.ShlObj.SHGFP_TYPE_CURRENT, buf)
dir = jna.Native.toString(buf.tostring()).rstrip("\0")
# Downgrade to short path name if have highbit chars. See
# <http://bugs.activestate.com/show_bug.cgi?id=85099>.
has_high_char = False
for c in dir:
if ord(c) > 255:
has_high_char = True
break
if has_high_char:
buf = array.zeros('c', buf_size)
kernel = win32.Kernel32.INSTANCE
if kernel.GetShortPathName(dir, buf, buf_size):
dir = jna.Native.toString(buf.tostring()).rstrip("\0")
return dir
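# Pick the most capable backend available for resolving Windows special folders:
# pywin32 first, then ctypes, then JNA (for Jython), falling back to the registry.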
if system == "win32":
try:
import win32com.shell
_get_win_folder = _get_win_folder_with_pywin32
except ImportError:
try:
from ctypes import windll
_get_win_folder = _get_win_folder_with_ctypes
except ImportError:
try:
import com.sun.jna
_get_win_folder = _get_win_folder_with_jna
except ImportError:
_get_win_folder = _get_win_folder_from_registry
#---- self test code
if __name__ == "__main__":
appname = "MyApp"
appauthor = "MyCompany"
props = ("user_data_dir",
"user_config_dir",
"user_cache_dir",
"user_state_dir",
"user_log_dir",
"site_data_dir",
"site_config_dir")
print("-- app dirs %s --" % __version__)
print("-- app dirs (with optional 'version')")
dirs = AppDirs(appname, appauthor, version="1.0")
for prop in props:
print("%s: %s" % (prop, getattr(dirs, prop)))
print("\n-- app dirs (without optional 'version')")
dirs = AppDirs(appname, appauthor)
for prop in props:
print("%s: %s" % (prop, getattr(dirs, prop)))
print("\n-- app dirs (without optional 'appauthor')")
dirs = AppDirs(appname)
for prop in props:
print("%s: %s" % (prop, getattr(dirs, prop)))
print("\n-- app dirs (with disabled 'appauthor')")
dirs = AppDirs(appname, appauthor=False)
for prop in props:
print("%s: %s" % (prop, getattr(dirs, prop)))
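Worth noting when reading the self-test output: appdirs only computes paths, it never creates them. A minimal illustrative sketch of the usual calling pattern (app/author names reuse the README examples, not this repository):

```python
# Illustrative sketch: appdirs returns a path string only; creating the
# directory (and anything inside it) is up to the caller.
import os
from appdirs import user_cache_dir

cache_dir = user_cache_dir("SuperApp", "Acme")   # names borrowed from the README examples
os.makedirs(cache_dir, exist_ok=True)

with open(os.path.join(cache_dir, "state.json"), "w") as fh:
    fh.write("{}")
```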

@@ -0,0 +1 @@
pip

@@ -0,0 +1,20 @@
This package contains a modified version of ca-bundle.crt:
ca-bundle.crt -- Bundle of CA Root Certificates
This is a bundle of X.509 certificates of public Certificate Authorities
(CA). These were automatically extracted from Mozilla's root certificates
file (certdata.txt). This file can be found in the mozilla source tree:
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
It contains the certificates in PEM format and therefore
can be directly used with curl / libcurl / php_curl, or with
an Apache+mod_ssl webserver for SSL client authentication.
Just configure this file as the SSLCACertificateFile.
***** BEGIN LICENSE BLOCK *****
This Source Code Form is subject to the terms of the Mozilla Public License,
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
one at http://mozilla.org/MPL/2.0/.
***** END LICENSE BLOCK *****
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $

@@ -0,0 +1,67 @@
Metadata-Version: 2.1
Name: certifi
Version: 2024.8.30
Summary: Python package for providing Mozilla's CA Bundle.
Home-page: https://github.com/certifi/python-certifi
Author: Kenneth Reitz
Author-email: me@kennethreitz.com
License: MPL-2.0
Project-URL: Source, https://github.com/certifi/python-certifi
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
Classifier: Natural Language :: English
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.6
License-File: LICENSE
Certifi: Python SSL Certificates
================================
Certifi provides Mozilla's carefully curated collection of Root Certificates for
validating the trustworthiness of SSL certificates while verifying the identity
of TLS hosts. It has been extracted from the `Requests`_ project.
Installation
------------
``certifi`` is available on PyPI. Simply install it with ``pip``::
$ pip install certifi
Usage
-----
To reference the installed certificate authority (CA) bundle, you can use the
built-in function::
>>> import certifi
>>> certifi.where()
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
Or from the command line::
$ python -m certifi
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
Enjoy!
.. _`Requests`: https://requests.readthedocs.io/en/master/
Addition/Removal of Certificates
--------------------------------
Certifi does not support any addition/removal or other modification of the
CA trust store content. This project is intended to provide a reliable and
highly portable root of trust to python deployments. Look to upstream projects
for methods to use alternate trust.
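Beyond printing the path, the typical consumer of `certifi.where()` is an SSL context. A small illustrative sketch using only the standard library plus certifi (the target URL is a placeholder, not from this commit):

```python
# Illustrative sketch: trust exactly the Mozilla bundle shipped by certifi
# when opening an HTTPS connection with the standard library.
import ssl
import urllib.request

import certifi

ctx = ssl.create_default_context(cafile=certifi.where())
with urllib.request.urlopen("https://example.org", context=ctx) as resp:
    print(resp.status)
```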

@@ -0,0 +1,14 @@
certifi-2024.8.30.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
certifi-2024.8.30.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
certifi-2024.8.30.dist-info/METADATA,sha256=GhBHRVUN6a4ZdUgE_N5wmukJfyuoE-QyIl8Y3ifNQBM,2222
certifi-2024.8.30.dist-info/RECORD,,
certifi-2024.8.30.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
certifi-2024.8.30.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
certifi/__init__.py,sha256=p_GYZrjUwPBUhpLlCZoGb0miKBKSqDAyZC5DvIuqbHQ,94
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
certifi/__pycache__/__init__.cpython-312.pyc,,
certifi/__pycache__/__main__.cpython-312.pyc,,
certifi/__pycache__/core.cpython-312.pyc,,
certifi/cacert.pem,sha256=lO3rZukXdPyuk6BWUJFOKQliWaXH6HGh9l1GGrUgG0c,299427
certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (74.0.0)
Root-Is-Purelib: true
Tag: py3-none-any

@@ -0,0 +1 @@
certifi

@@ -0,0 +1,4 @@
from .core import contents, where
__all__ = ["contents", "where"]
__version__ = "2024.08.30"

@@ -0,0 +1,12 @@
import argparse
from certifi import contents, where
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()
if args.contents:
print(contents())
else:
print(where())

File diff suppressed because it is too large.

@@ -0,0 +1,114 @@
"""
certifi.py
~~~~~~~~~~
This module returns the installation location of cacert.pem or its contents.
"""
import sys
import atexit
def exit_cacert_ctx() -> None:
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
if sys.version_info >= (3, 11):
from importlib.resources import as_file, files
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the file
# in cases where we're inside of a zipimport situation until someone
# actually calls where(), but we don't want to re-extract the file
# on every call of where(), so we'll do it once then store it in a
# global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you to
# manage the cleanup of this file, so it doesn't actually return a
# path, it returns a context manager that will give you the path
# when you enter it and will do any cleanup when you leave it. In
# the common case of not needing a temporary file, it will just
# return the file system location and the __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
elif sys.version_info >= (3, 7):
from importlib.resources import path as get_path, read_text
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the
# file in cases where we're inside of a zipimport situation until
# someone actually calls where(), but we don't want to re-extract
# the file on every call of where(), so we'll do it once then store
# it in a global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you
# to manage the cleanup of this file, so it doesn't actually
# return a path, it returns a context manager that will give
# you the path when you enter it and will do any cleanup when
# you leave it. In the common case of not needing a temporary
# file, it will just return the file system location and the
# __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = get_path("certifi", "cacert.pem")
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return read_text("certifi", "cacert.pem", encoding="ascii")
else:
import os
import types
from typing import Union
Package = Union[types.ModuleType, str]
Resource = Union[str, "os.PathLike"]
# This fallback will work for Python versions prior to 3.7 that lack the
# importlib.resources module but relies on the existing `where` function
# so won't address issues with environments like PyOxidizer that don't set
# __file__ on modules.
def read_text(
package: Package,
resource: Resource,
encoding: str = 'utf-8',
errors: str = 'strict'
) -> str:
with open(where(), encoding=encoding) as data:
return data.read()
# If we don't have importlib.resources, then we will just do the old logic
# of assuming we're on the filesystem and munge the path directly.
def where() -> str:
f = os.path.dirname(__file__)
return os.path.join(f, "cacert.pem")
def contents() -> str:
return read_text("certifi", "cacert.pem", encoding="ascii")

@@ -0,0 +1 @@
pip

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -0,0 +1,695 @@
Metadata-Version: 2.1
Name: charset-normalizer
Version: 3.4.0
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
Home-page: https://github.com/Ousret/charset_normalizer
Author: Ahmed TAHRI
Author-email: tahri.ahmed@proton.me
License: MIT
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Text Processing :: Linguistic
Classifier: Topic :: Utilities
Classifier: Typing :: Typed
Requires-Python: >=3.7.0
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: unicode_backport
<h1 align="center">Charset Detection, for Everyone 👋</h1>
<p align="center">
<sup>The Real First Universal Charset Detector</sup><br>
<a href="https://pypi.org/project/charset-normalizer">
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
</a>
<a href="https://pepy.tech/project/charset-normalizer/">
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
</a>
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
</a>
</p>
<p align="center">
<sup><i>Featured Packages</i></sup><br>
<a href="https://github.com/jawah/niquests">
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-HTTP_1.1%2C%202%2C_and_3_Client-cyan">
</a>
<a href="https://github.com/jawah/wassima">
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
</a>
</p>
<p align="center">
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
<a href="https://github.com/nickspring/charset-normalizer-rs">
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
</a>
</p>
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
> I'm trying to resolve the issue by taking a new approach.
> All IANA character set names for which the Python core library provides codecs are supported.
<p align="center">
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
</p>
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
| `Fast` | ❌ | ✅ | ✅ |
| `Universal**` | ❌ | ✅ | ❌ |
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
| `Native Python` | ✅ | ✅ | ❌ |
| `Detect spoken language` | ❌ | ✅ | N/A |
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
<p align="center">
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
</p>
*\*\* : They are clearly using specific code for a specific encoding even if it covers most of the encodings in use.*<br>
Did you get here because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
## ⚡ Performance
This package offers better performance than its counterpart Chardet. Here are some numbers.
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
| charset-normalizer | 100 ms | 50 ms | 5 ms |
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
> Stats are generated using 400+ files with default parameters. For more details on the files used, see the GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays depend heavily on your CPU capabilities; the relative factors should remain the same.
> Keep in mind that the stats are generous and that Chardet's accuracy versus ours is measured using Chardet's initial capability
> (e.g. supported encodings). Challenge them if you want.
## ✨ Installation
Using pip:
```sh
pip install charset-normalizer -U
```
## 🚀 Basic Usage
### CLI
This package comes with a CLI.
```
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
file [file ...]
The Real First Universal Charset Detector. Discover originating encoding used
on text file. Normalize text to unicode.
positional arguments:
files File(s) to be analysed
optional arguments:
-h, --help show this help message and exit
-v, --verbose Display complementary information about file if any.
Stdout will contain logs about the detection process.
-a, --with-alternative
Output complementary possibilities if any. Top-level
JSON WILL be a list.
-n, --normalize Permit to normalize input file. If not set, program
does not write anything.
-m, --minimal Only output the charset detected to STDOUT. Disabling
JSON output.
-r, --replace Replace file when trying to normalize it instead of
creating a new one.
-f, --force Replace file without asking if you are sure, use this
flag with caution.
-t THRESHOLD, --threshold THRESHOLD
Define a custom maximum amount of chaos allowed in
decoded content. 0. <= chaos <= 1.
--version Show version information and exit.
```
```bash
normalizer ./data/sample.1.fr.srt
```
or
```bash
python -m charset_normalizer ./data/sample.1.fr.srt
```
🎉 Since version 1.4.0 the CLI produces an easily usable stdout result in JSON format.
```json
{
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
"encoding": "cp1252",
"encoding_aliases": [
"1252",
"windows_1252"
],
"alternative_encodings": [
"cp1254",
"cp1256",
"cp1258",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
"mbcs"
],
"language": "French",
"alphabets": [
"Basic Latin",
"Latin-1 Supplement"
],
"has_sig_or_bom": false,
"chaos": 0.149,
"coherence": 97.152,
"unicode_path": null,
"is_preferred": true
}
```
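If you would rather consume that report from a script, here is a minimal sketch. It assumes a single input file without `-a`, so the top-level JSON is a single object like the one above; the sample path is a placeholder.
```python
# Hedged sketch: run the CLI and read its JSON report from Python.
# With one file and no -a flag, stdout carries a single JSON object.
import json
import subprocess

completed = subprocess.run(
    ["normalizer", "./data/sample.1.fr.srt"],
    capture_output=True,
    text=True,
    check=True,
)
report = json.loads(completed.stdout)
print(report["encoding"], report["language"], report["is_preferred"])
```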
### Python
*Just print out normalized text*
```python
from charset_normalizer import from_path
results = from_path('./my_subtitle.srt')
print(str(results.best()))
```
*Upgrade your code without effort*
```python
from charset_normalizer import detect
```
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) backward-compatible result possible.
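As a hedged illustration of that drop-in behaviour (the payload below is only an example), the legacy helper keeps the familiar dict-shaped result:
```python
# Minimal sketch of the chardet-style legacy API; the payload is illustrative only.
from charset_normalizer import detect

result = detect("Comment ça va ? Très bien, merci.".encode("cp1252"))
print(result)  # a dict with 'encoding', 'language' and 'confidence' entries
```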
See the docs for advanced usage: [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
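For finer control, here is a hedged sketch of a few of the knobs exposed by `from_bytes`; the payload and the chosen values are only examples, while the parameter names come straight from the function signature.
```python
# Hedged sketch of advanced parameters on from_bytes; values are illustrative.
from charset_normalizer import from_bytes

payload = "L'été sera chaud, paraît-il.".encode("cp1252")
matches = from_bytes(
    payload,
    cp_isolation=["cp1252", "latin_1"],  # only probe these code pages
    threshold=0.3,                       # tolerate a bit more measured mess
    explain=True,                        # attach a StreamHandler and log the process
)
best = matches.best()
if best is not None:
    print(best.encoding, "->", str(best))
```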
## 😇 Why
When I started using Chardet, I noticed that it did not meet my expectations, and I wanted to propose a
reliable alternative using a completely different method. Also! I never back down from a good challenge!
I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical rendered strings.**
What I want is to get readable text, the best I can.
In a way, **I'm brute-forcing text decoding.** How cool is that? 😎
Don't confuse the **ftfy** package with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer converts a raw file in an unknown encoding to Unicode.
## 🍰 How
- Discard all charset encoding tables that could not fit the binary content.
- Measure the noise, or the mess, once opened (in chunks) with a corresponding charset encoding.
- Extract the matches with the lowest mess detected.
- Additionally, we measure coherence / probe for a language (a deliberately naive sketch of the first three steps follows below).
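The sketch below mirrors the first three steps only; the candidate list and the mess heuristic are invented for illustration and are not how this package scores anything internally.
```python
# Illustration only: steps 1-3 above in their most naive possible form.
# The candidate list and the mess heuristic are made up for this sketch.
CANDIDATES = ["utf_8", "cp1251", "cp1252", "latin_1"]

def naive_mess(text: str) -> float:
    """Proportion of characters that rarely occur in human-written text."""
    noisy = sum(1 for ch in text if not ch.isprintable() and ch not in "\r\n\t")
    return noisy / max(len(text), 1)

def naive_rank(payload: bytes):
    scored = []
    for encoding in CANDIDATES:
        try:
            decoded = payload.decode(encoding)      # step 1: discard tables that cannot fit
        except UnicodeDecodeError:
            continue
        scored.append((naive_mess(decoded), encoding, decoded))  # step 2: measure the mess
    return sorted(scored)                           # step 3: lowest mess first

for mess, encoding, decoded in naive_rank("Футбол это весело".encode("cp1251")):
    print(f"{encoding:10s} mess={mess:.2f} -> {decoded!r}")
# Several single-byte tables decode "cleanly" here, which is exactly why the real
# detector adds the coherence probe (the fourth bullet) on top of the mess measure.
```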
**Wait a minute**, what are noise/mess and coherence according to **YOU?**
*Noise:* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
I know that my interpretation of what constitutes noise is probably incomplete; feel free to contribute in order to
improve or rewrite it.
*Coherence:* For every language on earth, we have computed ranked letter-appearance occurrences (as best we can). So I thought
that intel is worth something here, and I use those records against decoded text to check whether I can detect intelligent design. A toy illustration follows below.
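As a toy illustration of that idea (the short English ranking below is an assumption made for this example; the per-language tables shipped with this package are far larger):
```python
# Toy coherence sketch: compare the observed letter ranking of a decoded text
# against a reference ranking. The short English ranking is an assumption for
# this example only; it is not the table used by charset-normalizer.
from collections import Counter

ENGLISH_BY_FREQUENCY = list("etaoinshrdlcumwfgypbvkjxqz")

def toy_coherence(decoded: str, reference=ENGLISH_BY_FREQUENCY) -> float:
    counts = Counter(ch.lower() for ch in decoded if ch.isalpha())
    observed = [letter for letter, _ in counts.most_common()]
    if not observed:
        return 0.0
    close = sum(
        1
        for rank, letter in enumerate(observed)
        if letter in reference and abs(reference.index(letter) - rank) <= 4
    )
    return close / len(observed)

# Closer to 1.0 means the observed ranking resembles the reference ranking.
print(round(toy_coherence("coherence is measured against ranked letter frequencies"), 3))
```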
## ⚡ Known limitations
- Language detection is unreliable when the text contains two or more languages sharing identical letters (e.g. HTML (English tags) + Turkish content (sharing Latin characters)).
- Every charset detector heavily depends on having sufficient content. In common cases, do not bother running detection on very tiny content.
## ⚠️ About Python EOLs
**If you are running:**
- Python >=2.7,<3.5: Unsupported
- Python 3.5: charset-normalizer < 2.1
- Python 3.6: charset-normalizer < 3.1
- Python 3.7: charset-normalizer < 4.0
Upgrade your Python interpreter as soon as possible.
## 👤 Contributing
Contributions, issues and feature requests are very much welcome.<br />
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
## 📝 License
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
## 💼 For Enterprise
Professional support for charset-normalizer is available as part of the [Tidelift
Subscription][1]. Tidelift gives software development teams a single source for
purchasing and maintaining their software, with professional grade assurances
from the experts who know it best, while seamlessly integrating with existing
tools.
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
# Changelog
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
### Added
- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.
- Support for Python 3.13 (#512)
### Fixed
- Relaxed the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407) (#537)
- Declared charset in content (preemptive detection) was not changed when converting to utf-8 bytes. (#381)
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
### Fixed
- Unintentional memory usage regression when using a large payload that matches several encodings (#376)
- Regression on some detection cases showcased in the documentation (#371)
### Added
- Noise (md) probe that identifies malformed Arabic representations due to the presence of letters in isolated form (credit to my wife)
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
### Changed
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
- Improved the general detection reliability based on reports from the community
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
### Added
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
- Support for 9 forgotten encodings that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
### Removed
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
### Changed
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
### Fixed
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
### Changed
- The type hint for function `from_path` no longer enforces `PathLike` as its first argument
- Minor improvement over the global detection reliability
### Added
- Introduce function `is_binary`, which relies on the main capabilities and is optimized to detect binaries
- Propagate the `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp`, allowing deeper control over the detection (default True)
- Explicit support for Python 3.12
### Fixed
- Edge case detection failure where a file would contain a 'very-long' camel-cased word (Issue #289)
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
### Added
- Argument `should_rename_legacy` for the legacy function `detect`; it now disregards any new arguments without raising errors (PR #262)
### Removed
- Support for Python 3.6 (PR #260)
### Changed
- Optional speedup provided by mypy/c 1.0.1
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
### Fixed
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
### Changed
- Speedup provided by mypy/c 0.990 on Python >= 3.7
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one); detailed mess-detector results will be logged
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
- `normalizer --version` now specifies whether the current version provides the extra speedup (meaning a mypyc-compiled wheel)
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Fixed
- CLI with option --normalize failed when using a full path for files
- TooManyAccentuatedPlugin induced false positives on the mess detection when too few alpha characters had been fed to it
- Sphinx warnings when generating the documentation
### Removed
- Coherence detector no longer returns 'Simple English'; it returns 'English' instead
- Coherence detector no longer returns 'Classical Chinese'; it returns 'Chinese' instead
- Breaking: Methods `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (it is unreliable and conflicts with ASCII)
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one); detailed mess-detector results will be logged
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
### Fixed
- CLI with option --normalize failed when using a full path for files
- TooManyAccentuatedPlugin induced false positives on the mess detection when too few alpha characters had been fed to it
### Removed
- Coherence detector no longer returns 'Simple English'; it returns 'English' instead
- Coherence detector no longer returns 'Classical Chinese'; it returns 'Chinese' instead
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
### Added
- `normalizer --version` now specifies whether the current version provides the extra speedup (meaning a mypyc-compiled wheel)
### Removed
- Breaking: Methods `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (it is unreliable and conflicts with ASCII)
### Fixed
- Sphinx warnings when generating the documentation
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
### Changed
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Removed
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
### Deprecated
- Function `normalize` scheduled for removal in 3.0
### Changed
- Removed useless call to decode in fn is_unprintable (#206)
### Fixed
- Third-party library (i18n xgettext) crashing because utf_8 with an underscore (PEP 263) was not recognized, from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
### Added
- Output the Unicode table version when running the CLI with `--version` (PR #194)
### Changed
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
### Fixed
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
### Removed
- Support for Python 3.5 (PR #192)
### Deprecated
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
### Fixed
- ASCII mis-detection in rare cases (PR #170)
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
### Added
- Explicit support for Python 3.11 (PR #164)
### Changed
- The logging behavior has been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
### Fixed
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
### Changed
- Skipping the language-detection (CD) on ASCII (PR #155)
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
### Changed
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
### Fixed
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
### Changed
- Improvement over Vietnamese detection (PR #126)
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
### Fixed
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
- Avoid using too insignificant chunk (PR #137)
### Added
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
### Added
- Add support for Kazakh (Cyrillic) language detection (PR #109)
### Changed
- Further improve inferring the language from a given single-byte code page (PR #112)
- Vainly trying to leverage PEP 263 when PEP 3120 is not supported (PR #116)
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
- Various detection improvement (MD+CD) (PR #117)
### Removed
- Remove redundant logging entry about detected language(s) (PR #115)
### Fixed
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
### Fixed
- Unforeseen regression with the loss of backward compatibility with some older minor versions of Python 3.5.x (PR #100)
- Fix CLI crash when using --minimal output in certain cases (PR #103)
### Changed
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
### Changed
- The project now complies with: flake8, mypy, isort and black to ensure better overall quality (PR #81)
- The BC support with v1.x was improved; the old staticmethods are restored (PR #82)
- The Unicode detection is slightly improved (PR #93)
- Add syntax sugar \_\_bool\_\_ for the results CharsetMatches list-container (PR #91)
### Removed
- The project no longer raises a warning on tiny content given for detection; it is simply logged as a warning instead (PR #92)
### Fixed
- In some rare cases, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
- The MANIFEST.in was not exhaustive (PR #78)
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
### Fixed
- The CLI no longer raises an unexpected exception when no encoding has been found (PR #70)
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
- Submatch factoring could be wrong in rare edge cases (PR #72)
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
- Fix line endings from CRLF to LF for certain project files (PR #67)
### Changed
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
- Allow fallback on specified encoding if any (PR #71)
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
### Changed
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
### Fixed
- Empty/too-small JSON payload mis-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
### Changed
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
### Fixed
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
- Using explain=False permanently disabled the verbose output in the current runtime (PR #47)
- One log entry (language target preemptive) was not shown in the logs when using explain=True (PR #47)
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
### Changed
- The default argument values of the public function normalize were not aligned with from_bytes (PR #53)
### Added
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
### Changed
- 4 to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
- Emphasis has been placed on UTF-8 detection; it should perform nearly instantaneously.
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
- The detection mechanism has been slightly improved; Turkish content is now detected correctly (most of the time)
- The program has been rewritten to improve readability and maintainability (now using static typing)
- utf_7 detection has been reinstated.
### Removed
- This package no longer requires anything when used with Python 3.5 (dropped cached_property)
- Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbo-Croatian.
- The exception hook on UnicodeDecodeError has been removed.
### Deprecated
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
### Fixed
- The CLI output used the relative path of the file(s); it should be absolute.
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
### Fixed
- Logger configuration/usage no longer conflict with others (PR #44)
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
### Removed
- Using standard logging instead of using the package loguru.
- Dropping nose test framework in favor of the maintained pytest.
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
- Stop support for UTF-7 that does not contain a SIG.
- Dropping PrettyTable, replaced with pure JSON output in CLI.
### Fixed
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
- Not searching properly for the BOM when trying utf32/16 parent codec.
### Changed
- Improving the package final size by compressing frequencies.json.
- Huge improvement over the largest payloads.
### Added
- CLI now produces JSON consumable output.
- Return ASCII if the given sequences fit, given reasonable confidence.
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
### Fixed
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
### Fixed
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
### Fixed
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
### Changed
- Amend the previous release to allow prettytable 2.0 (PR #35)
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
### Fixed
- Fix error while using the package with a python pre-release interpreter (PR #33)
### Changed
- Dependencies refactoring, constraints revised.
### Added
- Add python 3.9 and 3.10 to the supported interpreters
MIT License
Copyright (c) 2019 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,35 @@
../../../bin/normalizer,sha256=pWxmMYA_SquLIU6d0ASgK3copKj6QWxw28YimXUHlzw,251
charset_normalizer-3.4.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
charset_normalizer-3.4.0.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
charset_normalizer-3.4.0.dist-info/METADATA,sha256=WGbEW9ehh2spNJxo1M6sEGGZWmsQ-oj2DsMjV29zoms,34159
charset_normalizer-3.4.0.dist-info/RECORD,,
charset_normalizer-3.4.0.dist-info/WHEEL,sha256=Z868N0_Fq1ssfDKgnQWj75ig0pzypFewyov-H4g6Btc,153
charset_normalizer-3.4.0.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
charset_normalizer-3.4.0.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
charset_normalizer/__init__.py,sha256=UzI3xC8PhmcLRMzSgPb6minTmRq0kWznnCBJ8ZCc2XI,1577
charset_normalizer/__main__.py,sha256=JxY8bleaENOFlLRb9HfoeZCzAMnn2A1oGR5Xm2eyqg0,73
charset_normalizer/__pycache__/__init__.cpython-312.pyc,,
charset_normalizer/__pycache__/__main__.cpython-312.pyc,,
charset_normalizer/__pycache__/api.cpython-312.pyc,,
charset_normalizer/__pycache__/cd.cpython-312.pyc,,
charset_normalizer/__pycache__/constant.cpython-312.pyc,,
charset_normalizer/__pycache__/legacy.cpython-312.pyc,,
charset_normalizer/__pycache__/md.cpython-312.pyc,,
charset_normalizer/__pycache__/models.cpython-312.pyc,,
charset_normalizer/__pycache__/utils.cpython-312.pyc,,
charset_normalizer/__pycache__/version.cpython-312.pyc,,
charset_normalizer/api.py,sha256=kMyNUqrfBZU22PP0pYKrSldtYUGA24wsGlXGLAKra7c,22559
charset_normalizer/cd.py,sha256=xwZliZcTQFA3jU0c00PRiu9MNxXTFxQkFLWmMW24ZzI,12560
charset_normalizer/cli/__init__.py,sha256=D5ERp8P62llm2FuoMzydZ7d9rs8cvvLXqE-1_6oViPc,100
charset_normalizer/cli/__main__.py,sha256=zX9sV_ApU1d96Wb0cS04vulstdB4F0Eh7kLn-gevfw4,10411
charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc,,
charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc,,
charset_normalizer/constant.py,sha256=uwoW87NicWZDTLviX7le0wdoYBbhBQDA4n1JtJo77ts,40499
charset_normalizer/legacy.py,sha256=XJjkT0hejMH8qfAKz1ts8OUiBT18t2FJP3tJgLwUWwc,2327
charset_normalizer/md.cpython-312-aarch64-linux-gnu.so,sha256=medVy2qYxvmhqZLDgu6sOFWJ_3LJ2X3o-RJovGFelks,69800
charset_normalizer/md.py,sha256=SIIZcENrslI7h3v4GigbFN61fRyE_wiCN1z9Ii3fBRo,20138
charset_normalizer/md__mypyc.cpython-312-aarch64-linux-gnu.so,sha256=sxeTw_aoOZt6lM09TkDdRVjlOp1FyW8wJQWSCrj5ldc,322008
charset_normalizer/models.py,sha256=oAMAcBSEY7CngbUXJp34Wc4Rl9NKJJjGmUwW3EPtk6g,12425
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
charset_normalizer/utils.py,sha256=teiosMqzKjXyAHXnGdjSBOgnBZwx-SkBbCLrx0UXy8M,11894
charset_normalizer/version.py,sha256=AX66S4ytQFdd6F5jbVU2OPMqYwFS5M3BkMvyX-3BKF8,79

View File

@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: setuptools (75.1.0)
Root-Is-Purelib: false
Tag: cp312-cp312-manylinux_2_17_aarch64
Tag: cp312-cp312-manylinux2014_aarch64

View File

@ -0,0 +1,2 @@
[console_scripts]
normalizer = charset_normalizer.cli:cli_detect

View File

@ -0,0 +1 @@
charset_normalizer

View File

@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
The Real First Universal Charset Detector.
A library that helps you read text from an unknown charset encoding.
Motivated by chardet, this package is trying to resolve the issue by taking a new approach.
All IANA character set names for which the Python core library provides codecs are supported.
Basic usage:
>>> from charset_normalizer import from_bytes
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
>>> best_guess = results.best()
>>> str(best_guess)
'Bсеки човек има право на образование. Oбразованието!'
Other methods and usages are available - see the full documentation
at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging
from .api import from_bytes, from_fp, from_path, is_binary
from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__
__all__ = (
"from_fp",
"from_path",
"from_bytes",
"is_binary",
"detect",
"CharsetMatch",
"CharsetMatches",
"__version__",
"VERSION",
"set_logging_handler",
)
# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())

View File

@ -0,0 +1,4 @@
from .cli import cli_detect
if __name__ == "__main__":
cli_detect()

View File

@ -0,0 +1,668 @@
import logging
from os import PathLike
from typing import BinaryIO, List, Optional, Set, Union
from .cd import (
coherence_ratio,
encoding_languages,
mb_encoding_languages,
merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
is_multi_byte_encoding,
should_strip_sig_or_bom,
)
# Will most likely be controversial
# logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
def from_bytes(
sequences: Union[bytes, bytearray],
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possible charsets usable to render str objects.
If there are no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence,
and will give up on a particular code page after 20% of measured mess. Those criteria are customizable at will.
The preemptive behaviour DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
but never takes it for granted. It can improve performance.
You may want to focus your attention on some code pages and/or exclude others; use cp_isolation and cp_exclusion for that
purpose.
This function will strip the SIG in the payload/sequence every time except for UTF-16 and UTF-32.
By default the library does not set up any handler other than the NullHandler; if you choose to set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
A custom logging format and handler can be set manually.
"""
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
"Expected object of type bytes or bytearray, got: {0}".format(
type(sequences)
)
)
if explain:
previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)
length: int = len(sequences)
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
if cp_isolation is not None:
logger.log(
TRACE,
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
)
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
else:
cp_isolation = []
if cp_exclusion is not None:
logger.log(
TRACE,
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
)
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
else:
cp_exclusion = []
if length <= (chunk_size * steps):
logger.log(
TRACE,
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
length,
)
steps = 1
chunk_size = length
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
if is_too_small_sequence:
logger.log(
TRACE,
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
),
)
elif is_too_large_sequence:
logger.log(
TRACE,
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
),
)
prioritized_encodings: List[str] = []
specified_encoding: Optional[str] = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
logger.log(
TRACE,
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
tested: Set[str] = set()
tested_but_hard_failure: List[str] = []
tested_but_soft_failure: List[str] = []
fallback_ascii: Optional[CharsetMatch] = None
fallback_u8: Optional[CharsetMatch] = None
fallback_specified: Optional[CharsetMatch] = None
results: CharsetMatches = CharsetMatches()
early_stop_results: CharsetMatches = CharsetMatches()
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
logger.log(
TRACE,
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
)
prioritized_encodings.append("ascii")
if "utf_8" not in prioritized_encodings:
prioritized_encodings.append("utf_8")
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
if cp_isolation and encoding_iana not in cp_isolation:
continue
if cp_exclusion and encoding_iana in cp_exclusion:
continue
if encoding_iana in tested:
continue
tested.add(encoding_iana)
decoded_payload: Optional[str] = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
)
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
continue
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
encoding_iana,
)
continue
try:
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
"Encoding %s does not provide an IncrementalDecoder",
encoding_iana,
)
continue
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)]
),
encoding=encoding_iana,
)
else:
decoded_payload = str(
(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :]
),
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
logger.log(
TRACE,
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
similar_soft_failure_test: bool = False
for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
similar_soft_failure_test = True
break
if similar_soft_failure_test:
logger.log(
TRACE,
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
)
continue
r_ = range(
0 if not bom_or_sig_available else len(sig_payload),
length,
int(length / steps),
)
multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
)
if multi_byte_bonus:
logger.log(
TRACE,
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
)
max_chunk_gave_up: int = int(len(r_) / 4)
max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count: int = 0
lazy_str_hard_failure = False
md_chunks: List[str] = []
md_ratios = []
try:
for chunk in cut_sequence_chunks(
sequences,
encoding_iana,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
decoded_payload,
):
md_chunks.append(chunk)
md_ratios.append(
mess_ratio(
chunk,
threshold,
explain is True and 1 <= len(cp_isolation) <= 2,
)
)
if md_ratios[-1] >= threshold:
early_stop_count += 1
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
except (
UnicodeDecodeError
) as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
if (
not lazy_str_hard_failure
and is_too_large_sequence
and not is_multi_byte_decoder
):
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.log(
TRACE,
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
TRACE,
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3),
)
# Preparing those fallbacks in case we got nothing.
if (
enable_fallback
and encoding_iana in ["ascii", "utf_8", specified_encoding]
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences,
encoding_iana,
threshold,
False,
[],
decoded_payload,
preemptive_declaration=specified_encoding,
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
elif encoding_iana == "ascii":
fallback_ascii = fallback_entry
else:
fallback_u8 = fallback_entry
continue
logger.log(
TRACE,
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3),
)
if not is_multi_byte_decoder:
target_languages: List[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
if target_languages:
logger.log(
TRACE,
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
),
)
cd_ratios = []
# We shall skip the CD when its about ASCII
# Most of the time its not relevant to run "language-detection" on it.
if encoding_iana != "ascii":
for chunk in md_chunks:
chunk_languages = coherence_ratio(
chunk,
language_threshold,
",".join(target_languages) if target_languages else None,
)
cd_ratios.append(chunk_languages)
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
if cd_ratios_merged:
logger.log(
TRACE,
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
),
)
current_match = CharsetMatch(
sequences,
encoding_iana,
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
(
decoded_payload
if (
is_too_large_sequence is False
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
)
else None
),
preemptive_declaration=specified_encoding,
)
results.append(current_match)
if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
# If md says nothing to worry about, then... stop immediately!
if mean_mess_ratio == 0.0:
logger.debug(
"Encoding detection: %s is most likely the one.",
current_match.encoding,
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([current_match])
early_stop_results.append(current_match)
if (
len(early_stop_results)
and (specified_encoding is None or specified_encoding in tested)
and "ascii" in tested
and "utf_8" in tested
):
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
logger.debug(
"Encoding detection: %s is most likely the one.",
probable_result.encoding,
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([probable_result])
if encoding_iana == sig_encoding:
logger.debug(
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
"the beginning of the sequence.",
encoding_iana,
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if len(results) == 0:
if fallback_u8 or fallback_ascii or fallback_specified:
logger.log(
TRACE,
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
)
if fallback_specified:
logger.debug(
"Encoding detection: %s will be used as a fallback match",
fallback_specified.encoding,
)
results.append(fallback_specified)
elif (
(fallback_u8 and fallback_ascii is None)
or (
fallback_u8
and fallback_ascii
and fallback_u8.fingerprint != fallback_ascii.fingerprint
)
or (fallback_u8 is not None)
):
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
logger.debug("Encoding detection: ascii will be used as a fallback match")
results.append(fallback_ascii)
if results:
logger.debug(
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
results.best().encoding, # type: ignore
len(results) - 1,
)
else:
logger.debug("Encoding detection: Unable to determine any suitable charset.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return results
def from_fp(
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same as the function from_bytes but using a file pointer that is already ready.
Will not close the file pointer.
"""
return from_bytes(
fp.read(),
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def from_path(
path: Union[str, bytes, PathLike], # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same as the function from_bytes but with one extra step: opening and reading the given file path in binary mode.
Can raise IOError.
"""
with open(path, "rb") as fp:
return from_fp(
fp,
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def is_binary(
fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = False,
) -> bool:
"""
Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
are disabled to be stricter about content that is ASCII-compatible but unlikely to be a string.
"""
if isinstance(fp_or_path_or_payload, (str, PathLike)):
guesses = from_path(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
elif isinstance(
fp_or_path_or_payload,
(
bytes,
bytearray,
),
):
guesses = from_bytes(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
else:
guesses = from_fp(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
return not guesses

View File

@ -0,0 +1,395 @@
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
from .constant import (
FREQUENCIES,
KO_NAMES,
LANGUAGE_SUPPORTED_COUNT,
TOO_SMALL_SEQUENCE,
ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
is_accentuated,
is_latin,
is_multi_byte_encoding,
is_unicode_range_secondary,
unicode_range,
)
def encoding_unicode_range(iana_name: str) -> List[str]:
"""
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise IOError("Function not supported on multi-byte code page")
decoder = importlib.import_module(
"encodings.{}".format(iana_name)
).IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: Dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
character_range: Optional[str] = unicode_range(chunk)
if character_range is None:
continue
if is_unicode_range_secondary(character_range) is False:
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
]
)
def unicode_range_languages(primary_range: str) -> List[str]:
"""
Return inferred languages used with a unicode range.
"""
languages: List[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
if unicode_range(character) == primary_range:
languages.append(language)
break
return languages
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
"""
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
primary_range: Optional[str] = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
primary_range = specified_range
break
if primary_range is None:
return ["Latin Based"]
return unicode_range_languages(primary_range)
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
if (
iana_name.startswith("shift_")
or iana_name.startswith("iso2022_jp")
or iana_name.startswith("euc_j")
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]
return []
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
target_have_accents: bool = False
target_pure_latin: bool = True
for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
target_have_accents = True
if target_pure_latin and is_latin(character) is False:
target_pure_latin = False
return target_have_accents, target_pure_latin
def alphabet_languages(
characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
"""
Return the languages associated with the given characters.
"""
languages: List[Tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
for language, language_characters in FREQUENCIES.items():
target_have_accents, target_pure_latin = get_target_features(language)
if ignore_non_latin and target_pure_latin is False:
continue
if target_have_accents is False and source_have_accents:
continue
character_count: int = len(language_characters)
character_match_count: int = len(
[c for c in language_characters if c in characters]
)
ratio: float = character_match_count / character_count
if ratio >= 0.2:
languages.append((language, ratio))
languages = sorted(languages, key=lambda x: x[1], reverse=True)
return [compatible_language[0] for compatible_language in languages]
def characters_popularity_compare(
language: str, ordered_characters: List[str]
) -> float:
"""
Determine if an ordered character list (by occurrence, from most frequent to rarest) matches a particular language.
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
Beware that this function is not strict on the match in order to ease the detection. (Meaning a close match is 1.)
"""
if language not in FREQUENCIES:
raise ValueError("{} not available".format(language))
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
ordered_characters_count: int = len(ordered_characters)
target_language_characters_count: int = len(FREQUENCIES[language])
large_alphabet: bool = target_language_characters_count > 26
for character, character_rank in zip(
ordered_characters, range(0, ordered_characters_count)
):
if character not in FREQUENCIES_language_set:
continue
character_rank_in_language: int = FREQUENCIES[language].index(character)
expected_projection_ratio: float = (
target_language_characters_count / ordered_characters_count
)
character_rank_projection: int = int(character_rank * expected_projection_ratio)
if (
large_alphabet is False
and abs(character_rank_projection - character_rank_in_language) > 4
):
continue
if (
large_alphabet is True
and abs(character_rank_projection - character_rank_in_language)
< target_language_characters_count / 3
):
character_approved_count += 1
continue
characters_before_source: List[str] = FREQUENCIES[language][
0:character_rank_in_language
]
characters_after_source: List[str] = FREQUENCIES[language][
character_rank_in_language:
]
characters_before: List[str] = ordered_characters[0:character_rank]
characters_after: List[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
)
after_match_count: int = len(
set(characters_after) & set(characters_after_source)
)
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
continue
if len(characters_after_source) == 0 and after_match_count <= 4:
character_approved_count += 1
continue
if (
before_match_count / len(characters_before_source) >= 0.4
or after_match_count / len(characters_after_source) >= 0.4
):
character_approved_count += 1
continue
return character_approved_count / len(ordered_characters)
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
"""
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
one containing the Latin letters and the other the Hebrew.
"""
layers: Dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
layer_target_range: Optional[str] = None
for discovered_range in layers:
if (
is_suspiciously_successive_range(discovered_range, character_range)
is False
):
layer_target_range = discovered_range
break
if layer_target_range is None:
layer_target_range = character_range
if layer_target_range not in layers:
layers[layer_target_range] = character.lower()
continue
layers[layer_target_range] += character.lower()
return list(layers.values())
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
"""
This function merges results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
per_language_ratios: Dict[str, List[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
if language not in per_language_ratios:
per_language_ratios[language] = [ratio]
continue
per_language_ratios[language].append(ratio)
merge = [
(
language,
round(
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
4,
),
)
for language in per_language_ratios
]
return sorted(merge, key=lambda x: x[1], reverse=True)
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
"""
We shall NOT return "English—" in CoherenceMatches because it is an alternative
of "English". This function only keeps the best match and remove the em-dash in it.
"""
index_results: Dict[str, List[float]] = dict()
for result in results:
language, ratio = result
no_em_name: str = language.replace("—", "")
if no_em_name not in index_results:
index_results[no_em_name] = []
index_results[no_em_name].append(ratio)
if any(len(index_results[e]) > 1 for e in index_results):
filtered_results: CoherenceMatches = []
for language in index_results:
filtered_results.append((language, max(index_results[language])))
return filtered_results
return results
@lru_cache(maxsize=2048)
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
results: List[Tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies: TypeCounter[str] = Counter(layer)
most_common = sequence_frequencies.most_common()
character_count: int = sum(o for c, o in most_common)
if character_count <= TOO_SMALL_SEQUENCE:
continue
popular_character_ordered: List[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio: float = characters_popularity_compare(
language, popular_character_ordered
)
if ratio < threshold:
continue
elif ratio >= 0.8:
sufficient_match_count += 1
results.append((language, round(ratio, 4)))
if sufficient_match_count >= 3:
break
return sorted(
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
)

View File

@ -0,0 +1,6 @@
from .__main__ import cli_detect, query_yes_no
__all__ = (
"cli_detect",
"query_yes_no",
)

View File

@ -0,0 +1,320 @@
import argparse
import sys
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from typing import List, Optional
from unicodedata import unidata_version
import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
def query_yes_no(question: str, default: str = "yes") -> bool:
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
"default" is the presumed answer if the user just hits <Enter>.
It must be "yes" (the default), "no" or None (meaning
an answer is required of the user).
The "answer" return value is True for "yes" or False for "no".
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
"""
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
prompt = " [Y/n] "
elif default == "no":
prompt = " [y/N] "
else:
raise ValueError("invalid default answer: '%s'" % default)
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
def cli_detect(argv: Optional[List[str]] = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
:return: 0 if everything is fine, anything else equal trouble
"""
parser = argparse.ArgumentParser(
description="The Real First Universal Charset Detector. "
"Discover originating encoding used on text file. "
"Normalize text to unicode."
)
parser.add_argument(
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Display complementary information about file if any. "
"Stdout will contain logs about the detection process.",
)
parser.add_argument(
"-a",
"--with-alternative",
action="store_true",
default=False,
dest="alternatives",
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
)
parser.add_argument(
"-n",
"--normalize",
action="store_true",
default=False,
dest="normalize",
help="Permit to normalize input file. If not set, program does not write anything.",
)
parser.add_argument(
"-m",
"--minimal",
action="store_true",
default=False,
dest="minimal",
help="Only output the charset detected to STDOUT. Disabling JSON output.",
)
parser.add_argument(
"-r",
"--replace",
action="store_true",
default=False,
dest="replace",
help="Replace file when trying to normalize it instead of creating a new one.",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
default=False,
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-i",
"--no-preemptive",
action="store_true",
default=False,
dest="no_preemptive",
help="Disable looking at a charset declaration to hint the detector.",
)
parser.add_argument(
"-t",
"--threshold",
action="store",
default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
)
parser.add_argument(
"--version",
action="version",
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
__version__,
python_version(),
unidata_version,
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
),
help="Show version information and exit.",
)
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1
if args.threshold < 0.0 or args.threshold > 1.0:
if args.files:
for my_file in args.files:
my_file.close()
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
matches = from_fp(
my_file,
threshold=args.threshold,
explain=args.verbose,
preemptive_behaviour=args.no_preemptive is False,
)
best_guess = matches.best()
if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
(
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else ""
),
),
file=sys.stderr,
)
x_.append(
CliDetectionResult(
abspath(my_file.name),
None,
[],
[],
"Unknown",
[],
False,
1.0,
0.0,
None,
True,
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
)
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != best_guess:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[
cp
for cp in el.could_be_from_charset
if cp != el.encoding
],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False,
)
)
if args.normalize is True:
if best_guess.encoding.startswith("utf") is True:
print(
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
my_file.name
),
file=sys.stderr,
)
if my_file.closed is False:
my_file.close()
continue
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
o_: List[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
if my_file.closed is False:
my_file.close()
elif (
args.force is False
and query_yes_no(
'Are you sure to normalize "{}" by replacing it ?'.format(
my_file.name
),
"no",
)
is False
):
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if my_file.closed is False:
my_file.close()
if args.minimal is False:
print(
dumps(
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4,
)
)
else:
for my_file in args.files:
print(
", ".join(
[
el.encoding or "undefined"
for el in x_
if el.path == abspath(my_file.name)
]
)
)
return 0
if __name__ == "__main__":
cli_detect()
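A brief programmatic usage sketch (not part of the vendored file); cli_detect() accepts an argv-style list, and "sample.txt" is a placeholder file name:

from charset_normalizer.cli import cli_detect

# Prints only the detected charset for the given file and returns 0 on success,
# a non-zero code on error, as described in the docstring above.
exit_code = cli_detect(["--minimal", "sample.txt"])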

File diff suppressed because it is too large

View File

@ -0,0 +1,65 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Optional
from warnings import warn
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE
# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
from typing_extensions import TypedDict
class ResultDict(TypedDict):
encoding: Optional[str]
language: str
confidence: Optional[float]
def detect(
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
"""
chardet legacy method
Detect the encoding of the given byte string. It should be mostly backward-compatible with chardet.
The encoding name will match chardet's own spelling whenever possible (except for encoding names chardet does not support).
This function is deprecated but kept so you can migrate your project easily; consult the documentation for
further information. Not planned for removal.
:param byte_str: The byte sequence to examine.
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
"""
if len(kwargs):
warn(
f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
)
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
"Expected object of type bytes or bytearray, got: "
"{0}".format(type(byte_str))
)
if isinstance(byte_str, bytearray):
byte_str = bytes(byte_str)
r = from_bytes(byte_str).best()
encoding = r.encoding if r is not None else None
language = r.language if r is not None and r.language != "Unknown" else ""
confidence = 1.0 - r.chaos if r is not None else None
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig gets stripped in the detection/normalization process,
# but chardet does return 'utf-8-sig' and it is a valid codec name.
if r is not None and encoding == "utf_8" and r.bom:
encoding += "_sig"
if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
encoding = CHARDET_CORRESPONDENCE[encoding]
return {
"encoding": encoding,
"language": language,
"confidence": confidence,
}
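A short usage sketch (not part of the vendored file), assuming this module is importable as charset_normalizer.legacy:

from charset_normalizer.legacy import detect

# Drop-in replacement for chardet.detect(): returns a dict with encoding, language and confidence.
result = detect("Héllo wörld, ceci est un petit exemple accentué.".encode("cp1252"))
print(result["encoding"], result["language"], result["confidence"])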

View File

@ -0,0 +1,628 @@
from functools import lru_cache
from logging import getLogger
from typing import List, Optional
from .constant import (
COMMON_SAFE_ASCII_CHARACTERS,
TRACE,
UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
is_accentuated,
is_arabic,
is_arabic_isolated_form,
is_case_variable,
is_cjk,
is_emoticon,
is_hangul,
is_hiragana,
is_katakana,
is_latin,
is_punctuation,
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
)
class MessDetectorPlugin:
"""
Base abstract class used for mess detection plugins.
All detectors MUST extend and implement given methods.
"""
def eligible(self, character: str) -> bool:
"""
Determine if the given character should be fed in.
"""
raise NotImplementedError # pragma: nocover
def feed(self, character: str) -> None:
"""
The main routine to be executed for each character.
Insert the logic by which the text would be considered chaotic.
"""
raise NotImplementedError # pragma: nocover
def reset(self) -> None: # pragma: no cover
"""
Permit resetting the plugin to its initial state.
"""
raise NotImplementedError
@property
def ratio(self) -> float:
"""
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.0; there is no upper bound.
"""
raise NotImplementedError # pragma: nocover
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._punctuation_count: int = 0
self._symbol_count: int = 0
self._character_count: int = 0
self._last_printable_char: Optional[str] = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif (
character.isdigit() is False
and is_symbol(character)
and is_emoticon(character) is False
):
self._symbol_count += 2
self._last_printable_char = character
def reset(self) -> None: # pragma: no cover
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
ratio_of_punctuation: float = (
self._punctuation_count + self._symbol_count
) / self._character_count
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._accentuated_count: int = 0
def eligible(self, character: str) -> bool:
return character.isalpha()
def feed(self, character: str) -> None:
self._character_count += 1
if is_accentuated(character):
self._accentuated_count += 1
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._accentuated_count = 0
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._unprintable_count: int = 0
self._character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
def reset(self) -> None: # pragma: no cover
self._unprintable_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._successive_count: int = 0
self._character_count: int = 0
self._last_latin_character: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
def feed(self, character: str) -> None:
self._character_count += 1
if (
self._last_latin_character is not None
and is_accentuated(character)
and is_accentuated(self._last_latin_character)
):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if it's the same char duplicated with a different accent.
if remove_accent(character) == remove_accent(self._last_latin_character):
self._successive_count += 1
self._last_latin_character = character
def reset(self) -> None: # pragma: no cover
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._successive_count * 2) / self._character_count
class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
self._last_printable_seen: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character.isspace()
or is_punctuation(character)
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
if self._last_printable_seen is None:
self._last_printable_seen = character
return
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
unicode_range_b: Optional[str] = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@property
def ratio(self) -> float:
if self._character_count <= 13:
return 0.0
ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count
return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._word_count: int = 0
self._bad_word_count: int = 0
self._foreign_long_count: int = 0
self._is_current_word_bad: bool = False
self._foreign_long_watch: bool = False
self._character_count: int = 0
self._bad_character_count: int = 0
self._buffer: str = ""
self._buffer_accent_count: int = 0
self._buffer_glyph_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character.isalpha():
self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and (is_latin(character) is False or is_accentuated(character))
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
and is_hiragana(character) is False
and is_thai(character) is False
):
self._foreign_long_watch = True
if (
is_cjk(character)
or is_hangul(character)
or is_katakana(character)
or is_hiragana(character)
or is_thai(character)
):
self._buffer_glyph_count += 1
return
if not self._buffer:
return
if (
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
buffer_length: int = len(self._buffer)
self._character_count += buffer_length
if buffer_length >= 4:
if self._buffer_accent_count / buffer_length >= 0.5:
self._is_current_word_bad = True
# Words/buffers ending with an upper-case accentuated letter are so rare
# that we consider them all suspicious. Same weight as a foreign_long suspicion.
elif (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
elif self._buffer_glyph_count == 1:
self._is_current_word_bad = True
self._foreign_long_count += 1
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
for c, i in zip(self._buffer, range(0, buffer_length))
if c.isupper()
]
probable_camel_cased: bool = False
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
probable_camel_cased = True
if not probable_camel_cased:
self._foreign_long_count += 1
self._is_current_word_bad = True
if self._is_current_word_bad:
self._bad_word_count += 1
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
self._buffer_glyph_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
and is_symbol(character)
):
self._is_current_word_bad = True
self._buffer += character
def reset(self) -> None: # pragma: no cover
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0
self._foreign_long_count = 0
@property
def ratio(self) -> float:
if self._word_count <= 10 and self._foreign_long_count == 0:
return 0.0
return self._bad_character_count / self._character_count
class CjkInvalidStopPlugin(MessDetectorPlugin):
"""
GB (Chinese) based encodings often render the stop incorrectly when the content does not fit and
can be easily detected by searching for the overuse of '丅' and '丄'.
"""
def __init__(self) -> None:
self._wrong_stop_count: int = 0
self._cjk_character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character in {"", ""}:
self._wrong_stop_count += 1
return
if is_cjk(character):
self._cjk_character_count += 1
def reset(self) -> None: # pragma: no cover
self._wrong_stop_count = 0
self._cjk_character_count = 0
@property
def ratio(self) -> float:
if self._cjk_character_count < 16:
return 0.0
return self._wrong_stop_count / self._cjk_character_count
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._buf: bool = False
self._character_count_since_last_sep: int = 0
self._successive_upper_lower_count: int = 0
self._successive_upper_lower_count_final: int = 0
self._character_count: int = 0
self._last_alpha_seen: Optional[str] = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False
if chunk_sep and self._character_count_since_last_sep > 0:
if (
self._character_count_since_last_sep <= 64
and character.isdigit() is False
and self._current_ascii_only is False
):
self._successive_upper_lower_count_final += (
self._successive_upper_lower_count
)
self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True
return
if self._current_ascii_only is True and character.isascii() is False:
self._current_ascii_only = False
if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (
character.islower() and self._last_alpha_seen.isupper()
):
if self._buf is True:
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False
self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return self._successive_upper_lower_count_final / self._character_count
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._isolated_form_count = 0
def eligible(self, character: str) -> bool:
return is_arabic(character)
def feed(self, character: str) -> None:
self._character_count += 1
if is_arabic_isolated_form(character):
self._isolated_form_count += 1
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
isolated_form_usage: float = self._isolated_form_count / self._character_count
return isolated_form_usage
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
"""
Determine if two Unicode ranges seen next to each other can be considered suspicious.
"""
if unicode_range_a is None or unicode_range_b is None:
return True
if unicode_range_a == unicode_range_b:
return False
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
return False
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
return False
# Latin characters can be accompanied with a combining diacritical mark
# eg. Vietnamese.
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
"Combining" in unicode_range_a or "Combining" in unicode_range_b
):
return False
keywords_range_a, keywords_range_b = unicode_range_a.split(
" "
), unicode_range_b.split(" ")
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
continue
if el in keywords_range_b:
return False
# Japanese Exception
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if (range_a_jp_chars or range_b_jp_chars) and (
"CJK" in unicode_range_a or "CJK" in unicode_range_b
):
return False
if range_a_jp_chars and range_b_jp_chars:
return False
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
# Chinese/Japanese use dedicated range for punctuation and/or separators.
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
unicode_range_a in ["Katakana", "Hiragana"]
and unicode_range_b in ["Katakana", "Hiragana"]
):
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
return True
@lru_cache(maxsize=2048)
def mess_ratio(
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
"""
Compute a mess ratio for a decoded byte sequence. The maximum threshold stops the computation early.
"""
detectors: List[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
length: int = len(decoded_sequence) + 1
mean_mess_ratio: float = 0.0
if length < 512:
intermediary_mean_mess_ratio_calc: int = 32
elif length <= 1024:
intermediary_mean_mess_ratio_calc = 64
else:
intermediary_mean_mess_ratio_calc = 128
for character, index in zip(decoded_sequence + "\n", range(length)):
for detector in detectors:
if detector.eligible(character):
detector.feed(character)
if (
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
) or index == length - 1:
mean_mess_ratio = sum(dt.ratio for dt in detectors)
if mean_mess_ratio >= maximum_threshold:
break
if debug:
logger = getLogger("charset_normalizer")
logger.log(
TRACE,
"Mess-detector extended-analysis start. "
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
f"maximum_threshold={maximum_threshold}",
)
if len(decoded_sequence) > 16:
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
for dt in detectors: # pragma: nocover
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)
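A short usage sketch (not part of the vendored file), assuming this module is importable as charset_normalizer.md:

from charset_normalizer.md import mess_ratio

print(mess_ratio("The quick brown fox jumps over the lazy dog."))  # coherent text, expected to stay near 0.0
print(mess_ratio("ÃƒÂ©Ã¢â‚¬â„¢ mojibake ÃƒÂ¨Ã¢â‚¬Å“ sample ÃƒÂ¯Ã‚Â»Ã‚Â¿"))  # garbled text, expected to score noticeably higher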

View File

@ -0,0 +1,359 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
def __init__(
self,
payload: bytes,
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
preemptive_declaration: Optional[str] = None,
):
self._payload: bytes = payload
self._encoding: str = guessed_encoding
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
self._unicode_ranges: Optional[List[str]] = None
self._leaves: List[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
self._output_payload: Optional[bytes] = None
self._output_encoding: Optional[str] = None
self._string: Optional[str] = decoded_payload
self._preemptive_declaration: Optional[str] = preemptive_declaration
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
if isinstance(other, str):
return iana_name(other) == self.encoding
return False
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
def __lt__(self, other: object) -> bool:
"""
Implemented to make sorted() available on CharsetMatch items.
"""
if not isinstance(other, CharsetMatch):
raise ValueError
chaos_difference: float = abs(self.chaos - other.chaos)
coherence_difference: float = abs(self.coherence - other.coherence)
# Below 1% difference --> Use Coherence
if chaos_difference < 0.01 and coherence_difference > 0.02:
return self.coherence > other.coherence
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
# When the decision is difficult, prefer the result that decoded as many multi-byte characters as possible.
# For very large payloads, skip that comparison to preserve RAM usage.
if len(self._payload) >= TOO_BIG_SEQUENCE:
return self.chaos < other.chaos
return self.multi_byte_usage > other.multi_byte_usage
return self.chaos < other.chaos
@property
def multi_byte_usage(self) -> float:
return 1.0 - (len(str(self)) / len(self.raw))
def __str__(self) -> str:
# Lazy Str Loading
if self._string is None:
self._string = str(self._payload, self._encoding, "strict")
return self._string
def __repr__(self) -> str:
return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
def add_submatch(self, other: "CharsetMatch") -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
other.__class__
)
)
other._string = None # Unload RAM usage; dirty trick.
self._leaves.append(other)
@property
def encoding(self) -> str:
return self._encoding
@property
def encoding_aliases(self) -> List[str]:
"""
Encodings are known by many names; using this could help when searching for IBM855 when it is listed as CP855.
"""
also_known_as: List[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
elif self.encoding == p:
also_known_as.append(u)
return also_known_as
@property
def bom(self) -> bool:
return self._has_sig_or_bom
@property
def byte_order_mark(self) -> bool:
return self._has_sig_or_bom
@property
def languages(self) -> List[str]:
"""
Return the complete list of possible languages found in the decoded sequence.
Usually not really useful. The returned list may be empty even if the 'language' property returns something other than 'Unknown'.
"""
return [e[0] for e in self._languages]
@property
def language(self) -> str:
"""
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
"Unknown".
"""
if not self._languages:
# Trying to infer the language based on the given encoding
# It's either English, or in certain cases we should not commit to an answer.
if "ascii" in self.could_be_from_charset:
return "English"
# doing it there to avoid circular import
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
languages = (
mb_encoding_languages(self.encoding)
if is_multi_byte_encoding(self.encoding)
else encoding_languages(self.encoding)
)
if len(languages) == 0 or "Latin Based" in languages:
return "Unknown"
return languages[0]
return self._languages[0][0]
@property
def chaos(self) -> float:
return self._mean_mess_ratio
@property
def coherence(self) -> float:
if not self._languages:
return 0.0
return self._languages[0][1]
@property
def percent_chaos(self) -> float:
return round(self.chaos * 100, ndigits=3)
@property
def percent_coherence(self) -> float:
return round(self.coherence * 100, ndigits=3)
@property
def raw(self) -> bytes:
"""
Original untouched bytes.
"""
return self._payload
@property
def submatch(self) -> List["CharsetMatch"]:
return self._leaves
@property
def has_submatch(self) -> bool:
return len(self._leaves) > 0
@property
def alphabets(self) -> List[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
detected_ranges: List[Optional[str]] = [
unicode_range(char) for char in str(self)
]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
def could_be_from_charset(self) -> List[str]:
"""
The complete list of encodings that output the exact same str result and therefore could be the originating
encoding.
This list includes the encoding available in the 'encoding' property.
"""
return [self._encoding] + [m.encoding for m in self._leaves]
def output(self, encoding: str = "utf_8") -> bytes:
"""
Method to get the re-encoded bytes payload using the given target encoding. Defaults to UTF-8.
Characters that cannot be encoded are substituted by the encoder (errors='replace'), not dropped.
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
decoded_string = str(self)
if (
self._preemptive_declaration is not None
and self._preemptive_declaration.lower()
not in ["utf-8", "utf8", "utf_8"]
):
patched_header = sub(
RE_POSSIBLE_ENCODING_INDICATION,
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
m.groups()[0], iana_name(self._output_encoding) # type: ignore[arg-type]
),
decoded_string[:8192],
1,
)
decoded_string = patched_header + decoded_string[8192:]
self._output_payload = decoded_string.encode(encoding, "replace")
return self._output_payload # type: ignore
@property
def fingerprint(self) -> str:
"""
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
"""
return sha256(self.output()).hexdigest()
class CharsetMatches:
"""
Container with every CharsetMatch item, ordered by default from the most probable to the least probable.
Acts like a list (iterable) but does not implement all related methods.
"""
def __init__(self, results: Optional[List[CharsetMatch]] = None):
self._results: List[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
"""
if isinstance(item, int):
return self._results[item]
if isinstance(item, str):
item = iana_name(item, False)
for result in self._results:
if item in result.could_be_from_charset:
return result
raise KeyError
def __len__(self) -> int:
return len(self._results)
def __bool__(self) -> bool:
return len(self._results) > 0
def append(self, item: CharsetMatch) -> None:
"""
Insert a single match. Will be inserted accordingly to preserve sort.
Can be inserted as a submatch.
"""
if not isinstance(item, CharsetMatch):
raise ValueError(
"Cannot append instance '{}' to CharsetMatches".format(
str(item.__class__)
)
)
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) < TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
self._results = sorted(self._results)
def best(self) -> Optional["CharsetMatch"]:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
if not self._results:
return None
return self._results[0]
def first(self) -> Optional["CharsetMatch"]:
"""
Redundant method, call the method best(). Kept for BC reasons.
"""
return self.best()
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
def __init__(
self,
path: str,
encoding: Optional[str],
encoding_aliases: List[str],
alternative_encodings: List[str],
language: str,
alphabets: List[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
unicode_path: Optional[str],
is_preferred: bool,
):
self.path: str = path
self.unicode_path: Optional[str] = unicode_path
self.encoding: Optional[str] = encoding
self.encoding_aliases: List[str] = encoding_aliases
self.alternative_encodings: List[str] = alternative_encodings
self.language: str = language
self.alphabets: List[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> Dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,
"encoding_aliases": self.encoding_aliases,
"alternative_encodings": self.alternative_encodings,
"language": self.language,
"alphabets": self.alphabets,
"has_sig_or_bom": self.has_sig_or_bom,
"chaos": self.chaos,
"coherence": self.coherence,
"unicode_path": self.unicode_path,
"is_preferred": self.is_preferred,
}
def to_json(self) -> str:
return dumps(self.__dict__, ensure_ascii=True, indent=4)
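A short usage sketch (not part of the vendored file) showing how these models are normally obtained through the public from_bytes() API:

from charset_normalizer import from_bytes

matches = from_bytes("Où êtes-vous ? Voilà un exemple de texte accentué.".encode("utf_8"))
best = matches.best()
if best is not None:
    print(best.encoding, best.language, best.percent_chaos, best.percent_coherence)
    print(best.could_be_from_charset)  # the guessed encoding plus every equivalent decoding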

View File

@ -0,0 +1,421 @@
import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union
from _multibytecodec import MultibyteIncrementalDecoder
from .constant import (
ENCODING_MARKS,
IANA_SUPPORTED_SIMILAR,
RE_POSSIBLE_ENCODING_INDICATION,
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
return False
return (
"WITH GRAVE" in description
or "WITH ACUTE" in description
or "WITH CEDILLA" in description
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
or "WITH MACRON" in description
or "WITH RING ABOVE" in description
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
decomposed: str = unicodedata.decomposition(character)
if not decomposed:
return character
codes: List[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
"""
Retrieve the Unicode range official name from a single character.
"""
character_ord: int = ord(character)
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
if character_ord in ord_range:
return range_name
return None
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
return False
return "LATIN" in description
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "P" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Punctuation" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "S" in character_category or "N" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Forms" in character_range and character_category != "Lo"
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Emoticons" in character_range or "Pictographs" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in {"", "+", "<", ">"}:
return True
character_category: str = unicodedata.category(character)
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
return character.islower() != character.isupper()
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "CJK" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "HIRAGANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "KATAKANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "HANGUL" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "THAI" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "ARABIC" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? Its the ASCII substitute character.
and character != "\ufeff" # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
"""
Extract, using an ASCII-only decoder, any specified encoding declared in the first n bytes.
"""
if not isinstance(sequence, bytes):
raise TypeError
seq_len: int = len(sequence)
results: List[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
if len(results) == 0:
return None
for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if encoding_alias == specified_encoding:
return encoding_iana
if encoding_iana == specified_encoding:
return encoding_iana
return None
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
"""
Verify if a specific encoding is a multi-byte one based on its IANA name.
"""
return name in {
"utf_8",
"utf_8_sig",
"utf_16",
"utf_16_be",
"utf_16_le",
"utf_32",
"utf_32_le",
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
MultibyteIncrementalDecoder,
)
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
for mark in marks:
if sequence.startswith(mark):
return iana_encoding, mark
return None, b""
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
return iana_encoding not in {"utf_16", "utf_32"}
def iana_name(cp_name: str, strict: bool = True) -> str:
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if cp_name in [encoding_alias, encoding_iana]:
return encoding_iana
if strict:
raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
return cp_name
def range_scan(decoded_sequence: str) -> List[str]:
ranges: Set[str] = set()
for character in decoded_sequence:
character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
ranges.add(character_range)
return list(ranges)
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
decoder_a = importlib.import_module(
"encodings.{}".format(iana_name_a)
).IncrementalDecoder
decoder_b = importlib.import_module(
"encodings.{}".format(iana_name_b)
).IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
character_match_count: int = 0
for i in range(255):
to_be_decoded: bytes = bytes([i])
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
character_match_count += 1
return character_match_count / 254
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
"""
Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
the function cp_similarity.
"""
return (
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)
def set_logging_handler(
name: str = "charset_normalizer",
level: int = logging.INFO,
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)
def cut_sequence_chunks(
sequences: bytes,
encoding_iana: str,
offsets: range,
chunk_size: int,
bom_or_sig_available: bool,
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
chunk = decoded_payload[i : i + chunk_size]
if not chunk:
break
yield chunk
else:
for i in offsets:
chunk_end = i + chunk_size
if chunk_end > len(sequences) + 8:
continue
cut_sequence = sequences[i : i + chunk_size]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
)
# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0:
chunk_partial_size_chk: int = min(chunk_size, 16)
if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j:chunk_end]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
if chunk[:chunk_partial_size_chk] in decoded_payload:
break
yield chunk
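A short usage sketch (not part of the vendored file) for a few of the helpers above, assuming the module is importable as charset_normalizer.utils:

from charset_normalizer.utils import iana_name, identify_sig_or_bom, unicode_range

print(identify_sig_or_bom(b"\xef\xbb\xbfhello"))  # expected ('utf_8', b'\xef\xbb\xbf') for a UTF-8 BOM
print(iana_name("ISO-8859-1"))                    # normalized IANA/codec name, e.g. 'latin_1'
print(unicode_range("あ"))                        # containing Unicode range name, e.g. 'Hiragana'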

View File

@ -0,0 +1,6 @@
"""
Expose version
"""
__version__ = "3.4.0"
VERSION = __version__.split(".")

View File

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013-2019 Gauvain Pocentek, 2019-2023 python-gitlab team
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Wrapper for the GitLab API."""
import warnings
import gitlab.config # noqa: F401
from gitlab._version import ( # noqa: F401
__author__,
__copyright__,
__email__,
__license__,
__title__,
__version__,
)
from gitlab.client import Gitlab, GitlabList, GraphQL # noqa: F401
from gitlab.exceptions import * # noqa: F401,F403
warnings.filterwarnings("default", category=DeprecationWarning, module="^gitlab")
__all__ = [
"__author__",
"__copyright__",
"__email__",
"__license__",
"__title__",
"__version__",
"Gitlab",
"GitlabList",
"GraphQL",
]
__all__.extend(gitlab.exceptions.__all__)
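A minimal usage sketch (not part of the vendored file); the URL, token and project path below are placeholders:

import gitlab

gl = gitlab.Gitlab("https://gitlab.example.com", private_token="<your-token>")
project = gl.projects.get("mygroup/myproject")  # fetch a single project by its path
print(project.name, project.id)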

View File

@ -0,0 +1,4 @@
import gitlab.cli
if __name__ == "__main__":
gitlab.cli.main()

View File

@ -0,0 +1,22 @@
"""
Defines http backends for processing http requests
"""
from .requests_backend import (
JobTokenAuth,
OAuthTokenAuth,
PrivateTokenAuth,
RequestsBackend,
RequestsResponse,
)
DefaultBackend = RequestsBackend
DefaultResponse = RequestsResponse
__all__ = [
"DefaultBackend",
"DefaultResponse",
"JobTokenAuth",
"OAuthTokenAuth",
"PrivateTokenAuth",
]

View File

@ -0,0 +1,24 @@
from typing import Any
import httpx
from gql.transport.httpx import HTTPXTransport
class GitlabTransport(HTTPXTransport):
"""A gql httpx transport that reuses an existing httpx.Client.
By default, gql's transports do not have a keep-alive session
and do not enable providing your own session that's kept open.
This transport lets us provide and close our session on our own
and provide additional auth.
For details, see https://github.com/graphql-python/gql/issues/91.
"""
def __init__(self, *args: Any, client: httpx.Client, **kwargs: Any):
super().__init__(*args, **kwargs)
self.client = client
def connect(self) -> None:
pass
def close(self) -> None:
pass
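A hypothetical sketch (not part of the vendored file) of wiring this transport into a gql Client while keeping ownership of the httpx.Client, as the docstring above describes; the URL and token are placeholders:

import httpx
from gql import Client, gql

http_client = httpx.Client(headers={"Authorization": "Bearer <your-token>"})
transport = GitlabTransport("https://gitlab.example.com/api/graphql", client=http_client)
gql_client = Client(transport=transport, fetch_schema_from_transport=False)
result = gql_client.execute(gql("query { currentUser { username } }"))
http_client.close()  # connect()/close() above are no-ops, so the caller owns the session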

View File

@ -0,0 +1,32 @@
import abc
import sys
from typing import Any, Dict, Optional, Union
import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder # type: ignore
if sys.version_info >= (3, 8):
from typing import Protocol
else:
from typing_extensions import Protocol
class BackendResponse(Protocol):
@abc.abstractmethod
def __init__(self, response: requests.Response) -> None: ...
class Backend(Protocol):
@abc.abstractmethod
def http_request(
self,
method: str,
url: str,
json: Optional[Union[Dict[str, Any], bytes]],
data: Optional[Union[Dict[str, Any], MultipartEncoder]],
params: Optional[Any],
timeout: Optional[float],
verify: Optional[Union[bool, str]],
stream: Optional[bool],
**kwargs: Any,
) -> BackendResponse: ...

View File

@ -0,0 +1,168 @@
from __future__ import annotations
import dataclasses
from typing import Any, BinaryIO, Dict, Optional, TYPE_CHECKING, Union
import requests
from requests import PreparedRequest
from requests.auth import AuthBase
from requests.structures import CaseInsensitiveDict
from requests_toolbelt.multipart.encoder import MultipartEncoder # type: ignore
from . import protocol
class TokenAuth:
def __init__(self, token: str):
self.token = token
class OAuthTokenAuth(TokenAuth, AuthBase):
def __call__(self, r: PreparedRequest) -> PreparedRequest:
r.headers["Authorization"] = f"Bearer {self.token}"
r.headers.pop("PRIVATE-TOKEN", None)
r.headers.pop("JOB-TOKEN", None)
return r
class PrivateTokenAuth(TokenAuth, AuthBase):
def __call__(self, r: PreparedRequest) -> PreparedRequest:
r.headers["PRIVATE-TOKEN"] = self.token
r.headers.pop("JOB-TOKEN", None)
r.headers.pop("Authorization", None)
return r
class JobTokenAuth(TokenAuth, AuthBase):
def __call__(self, r: PreparedRequest) -> PreparedRequest:
r.headers["JOB-TOKEN"] = self.token
r.headers.pop("PRIVATE-TOKEN", None)
r.headers.pop("Authorization", None)
return r
@dataclasses.dataclass
class SendData:
content_type: str
data: Optional[Union[Dict[str, Any], MultipartEncoder]] = None
json: Optional[Union[Dict[str, Any], bytes]] = None
def __post_init__(self) -> None:
if self.json is not None and self.data is not None:
raise ValueError(
f"`json` and `data` are mutually exclusive. Only one can be set. "
f"json={self.json!r} data={self.data!r}"
)
class RequestsResponse(protocol.BackendResponse):
def __init__(self, response: requests.Response) -> None:
self._response: requests.Response = response
@property
def response(self) -> requests.Response:
return self._response
@property
def status_code(self) -> int:
return self._response.status_code
@property
def headers(self) -> CaseInsensitiveDict[str]:
return self._response.headers
@property
def content(self) -> bytes:
return self._response.content
@property
def reason(self) -> str:
return self._response.reason
def json(self) -> Any:
return self._response.json()
class RequestsBackend(protocol.Backend):
def __init__(self, session: Optional[requests.Session] = None) -> None:
self._client: requests.Session = session or requests.Session()
@property
def client(self) -> requests.Session:
return self._client
@staticmethod
def prepare_send_data(
files: Optional[Dict[str, Any]] = None,
post_data: Optional[Union[Dict[str, Any], bytes, BinaryIO]] = None,
raw: bool = False,
) -> SendData:
if files:
if post_data is None:
post_data = {}
else:
# When creating a `MultipartEncoder` instance with data-types
# which don't have an `encode` method it will cause an error:
# object has no attribute 'encode'
# So convert common non-string types into strings.
if TYPE_CHECKING:
assert isinstance(post_data, dict)
for k, v in post_data.items():
if isinstance(v, bool):
v = int(v)
if isinstance(v, (complex, float, int)):
post_data[k] = str(v)
post_data["file"] = files.get("file")
post_data["avatar"] = files.get("avatar")
data = MultipartEncoder(fields=post_data)
return SendData(data=data, content_type=data.content_type)
if raw and post_data:
return SendData(data=post_data, content_type="application/octet-stream")
if TYPE_CHECKING:
assert not isinstance(post_data, BinaryIO)
return SendData(json=post_data, content_type="application/json")
def http_request(
self,
method: str,
url: str,
json: Optional[Union[Dict[str, Any], bytes]] = None,
data: Optional[Union[Dict[str, Any], MultipartEncoder]] = None,
params: Optional[Any] = None,
timeout: Optional[float] = None,
verify: Optional[Union[bool, str]] = True,
stream: Optional[bool] = False,
**kwargs: Any,
) -> RequestsResponse:
"""Make HTTP request
Args:
method: The HTTP method to call ('get', 'post', 'put', 'delete', etc.)
url: The full URL
data: The data to send to the server in the body of the request
json: Data to send in the body in json by default
timeout: The timeout, in seconds, for the request
verify: Whether SSL certificates should be validated. If
the value is a string, it is the path to a CA file used for
certificate validation.
stream: Whether the data should be streamed
Returns:
A requests Response object.
"""
response: requests.Response = self._client.request(
method=method,
url=url,
params=params,
data=data,
timeout=timeout,
stream=stream,
verify=verify,
json=json,
**kwargs,
)
return RequestsResponse(response=response)
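A short usage sketch (not part of the vendored file), assuming this module is importable as gitlab._backends.requests_backend; the URL and token are placeholders:

from gitlab._backends.requests_backend import PrivateTokenAuth, RequestsBackend

backend = RequestsBackend()
backend.client.auth = PrivateTokenAuth("<your-token>")
resp = backend.http_request(method="get", url="https://gitlab.example.com/api/v4/version")
print(resp.status_code, resp.json())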

View File

@ -0,0 +1,6 @@
__author__ = "Gauvain Pocentek, python-gitlab team"
__copyright__ = "Copyright 2013-2019 Gauvain Pocentek, 2019-2023 python-gitlab team"
__email__ = "gauvainpocentek@gmail.com"
__license__ = "LGPL3"
__title__ = "python-gitlab"
__version__ = "5.1.0"

View File

@ -0,0 +1,394 @@
import copy
import importlib
import json
import pprint
import textwrap
from types import ModuleType
from typing import Any, Dict, Iterable, Optional, Type, TYPE_CHECKING, Union
import gitlab
from gitlab import types as g_types
from gitlab.exceptions import GitlabParsingError
from .client import Gitlab, GitlabList
__all__ = [
"RESTObject",
"RESTObjectList",
"RESTManager",
]
_URL_ATTRIBUTE_ERROR = (
f"https://python-gitlab.readthedocs.io/en/v{gitlab.__version__}/"
f"faq.html#attribute-error-list"
)
class RESTObject:
"""Represents an object built from server data.
It holds the attributes known from the server, and the updated attributes in
another. This allows smart updates, if the object allows it.
You can redefine ``_id_attr`` in child classes to specify which attribute
must be used as the unique ID. ``None`` means that the object can be updated
without ID in the url.
Likewise, you can define a ``_repr_attr`` in subclasses to specify which
attribute should be added as a human-readable identifier when called in the
object's ``__repr__()`` method.
"""
_id_attr: Optional[str] = "id"
_attrs: Dict[str, Any]
_created_from_list: bool # Indicates if object was created from a list() action
_module: ModuleType
_parent_attrs: Dict[str, Any]
_repr_attr: Optional[str] = None
_updated_attrs: Dict[str, Any]
_lazy: bool
manager: "RESTManager"
def __init__(
self,
manager: "RESTManager",
attrs: Dict[str, Any],
*,
created_from_list: bool = False,
lazy: bool = False,
) -> None:
if not isinstance(attrs, dict):
raise GitlabParsingError(
f"Attempted to initialize RESTObject with a non-dictionary value: "
f"{attrs!r}\nThis likely indicates an incorrect or malformed server "
f"response."
)
self.__dict__.update(
{
"manager": manager,
"_attrs": attrs,
"_updated_attrs": {},
"_module": importlib.import_module(self.__module__),
"_created_from_list": created_from_list,
"_lazy": lazy,
}
)
self.__dict__["_parent_attrs"] = self.manager.parent_attrs
self._create_managers()
def __getstate__(self) -> Dict[str, Any]:
state = self.__dict__.copy()
module = state.pop("_module")
state["_module_name"] = module.__name__
return state
def __setstate__(self, state: Dict[str, Any]) -> None:
module_name = state.pop("_module_name")
self.__dict__.update(state)
self.__dict__["_module"] = importlib.import_module(module_name)
def __getattr__(self, name: str) -> Any:
if name in self.__dict__["_updated_attrs"]:
return self.__dict__["_updated_attrs"][name]
if name in self.__dict__["_attrs"]:
value = self.__dict__["_attrs"][name]
# If the value is a list, we copy it in the _updated_attrs dict
# because we are not able to detect changes made on the object
# (append, insert, pop, ...). Without forcing the attr
# creation __setattr__ is never called, the list never ends up
# in the _updated_attrs dict, and the update() and save()
# methods never push the new data to the server.
# See https://github.com/python-gitlab/python-gitlab/issues/306
#
# note: _parent_attrs will only store simple values (int) so we
# don't make this check in the next block.
if isinstance(value, list):
self.__dict__["_updated_attrs"][name] = value[:]
return self.__dict__["_updated_attrs"][name]
return value
if name in self.__dict__["_parent_attrs"]:
return self.__dict__["_parent_attrs"][name]
message = f"{type(self).__name__!r} object has no attribute {name!r}"
if self._created_from_list:
message = (
f"{message}\n\n"
+ textwrap.fill(
f"{self.__class__!r} was created via a list() call and "
f"only a subset of the data may be present. To ensure "
f"all data is present get the object using a "
f"get(object.id) call. For more details, see:"
)
+ f"\n\n{_URL_ATTRIBUTE_ERROR}"
)
elif self._lazy:
message = f"{message}\n\n" + textwrap.fill(
f"If you tried to access object attributes returned from the server, "
f"note that {self.__class__!r} was created as a `lazy` object and was "
f"not initialized with any data."
)
raise AttributeError(message)
def __setattr__(self, name: str, value: Any) -> None:
self.__dict__["_updated_attrs"][name] = value
def asdict(self, *, with_parent_attrs: bool = False) -> Dict[str, Any]:
data = {}
if with_parent_attrs:
data.update(copy.deepcopy(self._parent_attrs))
data.update(copy.deepcopy(self._attrs))
data.update(copy.deepcopy(self._updated_attrs))
return data
@property
def attributes(self) -> Dict[str, Any]:
return self.asdict(with_parent_attrs=True)
def to_json(self, *, with_parent_attrs: bool = False, **kwargs: Any) -> str:
return json.dumps(self.asdict(with_parent_attrs=with_parent_attrs), **kwargs)
def __str__(self) -> str:
return f"{type(self)} => {self.asdict()}"
def pformat(self) -> str:
return f"{type(self)} => \n{pprint.pformat(self.asdict())}"
def pprint(self) -> None:
print(self.pformat())
def __repr__(self) -> str:
name = self.__class__.__name__
if (self._id_attr and self._repr_value) and (self._id_attr != self._repr_attr):
return (
f"<{name} {self._id_attr}:{self.get_id()} "
f"{self._repr_attr}:{self._repr_value}>"
)
if self._id_attr:
return f"<{name} {self._id_attr}:{self.get_id()}>"
if self._repr_value:
return f"<{name} {self._repr_attr}:{self._repr_value}>"
return f"<{name}>"
def __eq__(self, other: object) -> bool:
if not isinstance(other, RESTObject):
return NotImplemented
if self.get_id() and other.get_id():
return self.get_id() == other.get_id()
return super() == other
def __ne__(self, other: object) -> bool:
if not isinstance(other, RESTObject):
return NotImplemented
if self.get_id() and other.get_id():
return self.get_id() != other.get_id()
return super() != other
def __dir__(self) -> Iterable[str]:
return set(self.attributes).union(super().__dir__())
def __hash__(self) -> int:
if not self.get_id():
return super().__hash__()
return hash(self.get_id())
def _create_managers(self) -> None:
# NOTE(jlvillal): We are creating our managers by looking at the class
# annotations. If an attribute is annotated as being a *Manager type
# then we create the manager and assign it to the attribute.
for attr, annotation in sorted(self.__class__.__annotations__.items()):
# We ignore creating a manager for the 'manager' attribute as that
# is done in the self.__init__() method
if attr in ("manager",):
continue
if not isinstance(annotation, (type, str)): # pragma: no cover
continue
if isinstance(annotation, type):
cls_name = annotation.__name__
else:
cls_name = annotation
# All *Manager classes are used except for the base "RESTManager" class
if cls_name == "RESTManager" or not cls_name.endswith("Manager"):
continue
cls = getattr(self._module, cls_name)
manager = cls(self.manager.gitlab, parent=self)
# Since we have our own __setattr__ method, we can't use setattr()
self.__dict__[attr] = manager
def _update_attrs(self, new_attrs: Dict[str, Any]) -> None:
self.__dict__["_updated_attrs"] = {}
self.__dict__["_attrs"] = new_attrs
def get_id(self) -> Optional[Union[int, str]]:
"""Returns the id of the resource."""
if self._id_attr is None or not hasattr(self, self._id_attr):
return None
id_val = getattr(self, self._id_attr)
if TYPE_CHECKING:
assert id_val is None or isinstance(id_val, (int, str))
return id_val
@property
def _repr_value(self) -> Optional[str]:
"""Safely returns the human-readable resource name if present."""
if self._repr_attr is None or not hasattr(self, self._repr_attr):
return None
repr_val = getattr(self, self._repr_attr)
if TYPE_CHECKING:
assert isinstance(repr_val, str)
return repr_val
@property
def encoded_id(self) -> Optional[Union[int, str]]:
"""Ensure that the ID is url-encoded so that it can be safely used in a URL
path"""
obj_id = self.get_id()
if isinstance(obj_id, str):
obj_id = gitlab.utils.EncodedId(obj_id)
return obj_id
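# --- Illustrative sketch (not part of the library) ---------------------------
# How the RESTObject accessors above surface in everyday use: attribute reads
# fall back to the data returned by the server, writes land in _updated_attrs,
# and asdict()/to_json() merge both views. This assumes a reachable GitLab
# instance; the URL, token and project ID below are placeholders.
import gitlab

gl = gitlab.Gitlab("https://gitlab.example.com", private_token="<token>")
project = gl.projects.get(1)          # a RESTObject subclass instance

project.description = "updated"       # stored in _updated_attrs, not sent yet
print(project.asdict()["description"])            # "updated"
print(project.to_json(with_parent_attrs=True))    # JSON dump incl. parent attrs
print(repr(project))                  # built from _id_attr/_repr_attr, e.g. <Project id:1 ...>
# ------------------------------------------------------------------------------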
class RESTObjectList:
"""Generator object representing a list of RESTObject's.
This generator uses the Gitlab pagination system to fetch new data when
required.
Note: you should not instantiate such objects; they are returned by calls
to RESTManager.list()
Args:
manager: Manager to attach to the created objects
obj_cls: Type of objects to create from the json data
_list: A GitlabList object
"""
def __init__(
self, manager: "RESTManager", obj_cls: Type[RESTObject], _list: GitlabList
) -> None:
"""Creates an objects list from a GitlabList.
You should not create objects of this type, but use managers' list()
methods instead.
Args:
manager: the RESTManager to attach to the objects
obj_cls: the class of the created objects
_list: the GitlabList holding the data
"""
self.manager = manager
self._obj_cls = obj_cls
self._list = _list
def __iter__(self) -> "RESTObjectList":
return self
def __len__(self) -> int:
return len(self._list)
def __next__(self) -> RESTObject:
return self.next()
def next(self) -> RESTObject:
data = self._list.next()
return self._obj_cls(self.manager, data, created_from_list=True)
@property
def current_page(self) -> int:
"""The current page number."""
return self._list.current_page
@property
def prev_page(self) -> Optional[int]:
"""The previous page number.
If None, the current page is the first.
"""
return self._list.prev_page
@property
def next_page(self) -> Optional[int]:
"""The next page number.
If None, the current page is the last.
"""
return self._list.next_page
@property
def per_page(self) -> Optional[int]:
"""The number of items per page."""
return self._list.per_page
@property
def total_pages(self) -> Optional[int]:
"""The total number of pages."""
return self._list.total_pages
@property
def total(self) -> Optional[int]:
"""The total number of items."""
return self._list.total
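# --- Illustrative sketch (not part of the library) ---------------------------
# RESTObjectList is what RESTManager.list(iterator=True) hands back: a
# generator that follows GitLab's pagination transparently, so pages are only
# fetched as the loop advances. URL and token are placeholders.
import gitlab

gl = gitlab.Gitlab("https://gitlab.example.com", private_token="<token>")
projects = gl.projects.list(iterator=True)   # RESTObjectList, not a plain list
for project in projects:
    print(project.id, projects.current_page, projects.total_pages)
# ------------------------------------------------------------------------------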
class RESTManager:
"""Base class for CRUD operations on objects.
Derived class must define ``_path`` and ``_obj_cls``.
``_path``: Base URL path on which requests will be sent (e.g. '/projects')
``_obj_cls``: The class of objects that will be created
"""
_create_attrs: g_types.RequiredOptional = g_types.RequiredOptional()
_update_attrs: g_types.RequiredOptional = g_types.RequiredOptional()
_path: Optional[str] = None
_obj_cls: Optional[Type[RESTObject]] = None
_from_parent_attrs: Dict[str, Any] = {}
_types: Dict[str, Type[g_types.GitlabAttribute]] = {}
_computed_path: Optional[str]
_parent: Optional[RESTObject]
_parent_attrs: Dict[str, Any]
gitlab: Gitlab
def __init__(self, gl: Gitlab, parent: Optional[RESTObject] = None) -> None:
"""REST manager constructor.
Args:
gl: :class:`~gitlab.Gitlab` connection to use to make requests.
parent: REST object to which the manager is attached.
"""
self.gitlab = gl
self._parent = parent # for nested managers
self._computed_path = self._compute_path()
@property
def parent_attrs(self) -> Optional[Dict[str, Any]]:
return self._parent_attrs
def _compute_path(self, path: Optional[str] = None) -> Optional[str]:
self._parent_attrs = {}
if path is None:
path = self._path
if path is None:
return None
if self._parent is None or not self._from_parent_attrs:
return path
data: Dict[str, Optional[gitlab.utils.EncodedId]] = {}
for self_attr, parent_attr in self._from_parent_attrs.items():
if not hasattr(self._parent, parent_attr):
data[self_attr] = None
continue
data[self_attr] = gitlab.utils.EncodedId(getattr(self._parent, parent_attr))
self._parent_attrs = data
return path.format(**data)
@property
def path(self) -> Optional[str]:
return self._computed_path
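# --- Illustrative sketch (not part of the library) ---------------------------
# How concrete managers in gitlab/v4/objects use _path and _from_parent_attrs:
# _compute_path() above fills the {project_id} placeholder from the parent
# RESTObject's "id" attribute. The manager and object below are simplified
# examples, not real v4 classes.
from gitlab.base import RESTManager, RESTObject


class ExampleIssue(RESTObject):
    pass


class ExampleIssueManager(RESTManager):
    _path = "/projects/{project_id}/issues"
    _obj_cls = ExampleIssue
    _from_parent_attrs = {"project_id": "id"}
    # With a parent project whose id is 42, _compute_path() yields
    # "/projects/42/issues" and stores {"project_id": "42"} in _parent_attrs.
# ------------------------------------------------------------------------------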

View File

@ -0,0 +1,420 @@
import argparse
import dataclasses
import functools
import os
import pathlib
import re
import sys
from types import ModuleType
from typing import (
Any,
Callable,
cast,
Dict,
NoReturn,
Optional,
Tuple,
Type,
TYPE_CHECKING,
TypeVar,
Union,
)
from requests.structures import CaseInsensitiveDict
import gitlab.config
from gitlab.base import RESTObject
# This regex is based on:
# https://github.com/jpvanhal/inflection/blob/master/inflection/__init__.py
camel_upperlower_regex = re.compile(r"([A-Z]+)([A-Z][a-z])")
camel_lowerupper_regex = re.compile(r"([a-z\d])([A-Z])")
@dataclasses.dataclass
class CustomAction:
required: Tuple[str, ...]
optional: Tuple[str, ...]
in_object: bool
requires_id: bool # if the `_id_attr` value should be a required argument
help: Optional[str] # help text for the custom action
# custom_actions = {
# cls: {
# action: CustomAction,
# },
# }
custom_actions: Dict[str, Dict[str, CustomAction]] = {}
# For an explanation of how these type-hints work see:
# https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators
#
# The goal here is that functions which get decorated will retain their types.
__F = TypeVar("__F", bound=Callable[..., Any])
def register_custom_action(
*,
cls_names: Union[str, Tuple[str, ...]],
required: Tuple[str, ...] = (),
optional: Tuple[str, ...] = (),
custom_action: Optional[str] = None,
requires_id: bool = True, # if the `_id_attr` value should be a required argument
help: Optional[str] = None, # help text for the action
) -> Callable[[__F], __F]:
def wrap(f: __F) -> __F:
@functools.wraps(f)
def wrapped_f(*args: Any, **kwargs: Any) -> Any:
return f(*args, **kwargs)
# in_obj defines whether the method belongs to the obj or the manager
in_obj = True
if isinstance(cls_names, tuple):
classes = cls_names
else:
classes = (cls_names,)
for cls_name in classes:
final_name = cls_name
if cls_name.endswith("Manager"):
final_name = cls_name.replace("Manager", "")
in_obj = False
if final_name not in custom_actions:
custom_actions[final_name] = {}
action = custom_action or f.__name__.replace("_", "-")
custom_actions[final_name][action] = CustomAction(
required=required,
optional=optional,
in_object=in_obj,
requires_id=requires_id,
help=help,
)
return cast(__F, wrapped_f)
return wrap
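# --- Illustrative sketch (not part of the library) ---------------------------
# How the decorator above is used to expose an extra CLI sub-command. The
# class and method names here are made up for the example; real usages
# decorate methods of *Manager / object classes in gitlab/v4/objects.
from gitlab.cli import custom_actions, register_custom_action


class DummyManager:
    @register_custom_action(
        cls_names="DummyManager",
        required=("name",),
        optional=("description",),
        help="Create a dummy thing",
    )
    def create_thing(self, name: str, description: str = "") -> str:
        return f"{name}: {description}"


# The registry is keyed by class name (minus the "Manager" suffix) and the
# dashed method name, mirroring how `gitlab dummy create-thing` is resolved.
print(custom_actions["Dummy"]["create-thing"].required)   # ('name',)
# ------------------------------------------------------------------------------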
def die(msg: str, e: Optional[Exception] = None) -> NoReturn:
if e:
msg = f"{msg} ({e})"
sys.stderr.write(f"{msg}\n")
sys.exit(1)
def gitlab_resource_to_cls(
gitlab_resource: str, namespace: ModuleType
) -> Type[RESTObject]:
classes = CaseInsensitiveDict(namespace.__dict__)
lowercase_class = gitlab_resource.replace("-", "")
class_type = classes[lowercase_class]
if TYPE_CHECKING:
assert isinstance(class_type, type)
assert issubclass(class_type, RESTObject)
return class_type
def cls_to_gitlab_resource(cls: RESTObject) -> str:
dasherized_uppercase = camel_upperlower_regex.sub(r"\1-\2", cls.__name__)
dasherized_lowercase = camel_lowerupper_regex.sub(r"\1-\2", dasherized_uppercase)
return dasherized_lowercase.lower()
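# --- Illustrative sketch (not part of the library) ---------------------------
# The two helpers above translate between CLI resource names and classes in
# gitlab.v4.objects, e.g. "project-merge-request" <-> ProjectMergeRequest.
from gitlab.cli import cls_to_gitlab_resource, gitlab_resource_to_cls
from gitlab.v4 import objects

print(cls_to_gitlab_resource(objects.ProjectMergeRequest))   # project-merge-request
cls = gitlab_resource_to_cls("project-merge-request", namespace=objects)
print(cls is objects.ProjectMergeRequest)                    # True
# ------------------------------------------------------------------------------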
def _get_base_parser(add_help: bool = True) -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
add_help=add_help,
description="GitLab API Command Line Interface",
allow_abbrev=False,
)
parser.add_argument("--version", help="Display the version.", action="store_true")
parser.add_argument(
"-v",
"--verbose",
"--fancy",
help="Verbose mode (legacy format only) [env var: GITLAB_VERBOSE]",
action="store_true",
default=os.getenv("GITLAB_VERBOSE"),
)
parser.add_argument(
"-d",
"--debug",
help="Debug mode (display HTTP requests) [env var: GITLAB_DEBUG]",
action="store_true",
default=os.getenv("GITLAB_DEBUG"),
)
parser.add_argument(
"-c",
"--config-file",
action="append",
help=(
"Configuration file to use. Can be used multiple times. "
"[env var: PYTHON_GITLAB_CFG]"
),
)
parser.add_argument(
"-g",
"--gitlab",
help=(
"Which configuration section should "
"be used. If not defined, the default selection "
"will be used."
),
required=False,
)
parser.add_argument(
"-o",
"--output",
help="Output format (v4 only): json|legacy|yaml",
required=False,
choices=["json", "legacy", "yaml"],
default="legacy",
)
parser.add_argument(
"-f",
"--fields",
help=(
"Fields to display in the output (comma "
"separated). Not used with legacy output"
),
required=False,
)
parser.add_argument(
"--server-url",
help=("GitLab server URL [env var: GITLAB_URL]"),
required=False,
default=os.getenv("GITLAB_URL"),
)
ssl_verify_group = parser.add_mutually_exclusive_group()
ssl_verify_group.add_argument(
"--ssl-verify",
help=(
"Path to a CA_BUNDLE file or directory with certificates of trusted CAs. "
"[env var: GITLAB_SSL_VERIFY]"
),
required=False,
default=os.getenv("GITLAB_SSL_VERIFY"),
)
ssl_verify_group.add_argument(
"--no-ssl-verify",
help="Disable SSL verification",
required=False,
dest="ssl_verify",
action="store_false",
)
parser.add_argument(
"--timeout",
help=(
"Timeout to use for requests to the GitLab server. "
"[env var: GITLAB_TIMEOUT]"
),
required=False,
type=int,
default=os.getenv("GITLAB_TIMEOUT"),
)
parser.add_argument(
"--api-version",
help=("GitLab API version [env var: GITLAB_API_VERSION]"),
required=False,
default=os.getenv("GITLAB_API_VERSION"),
)
parser.add_argument(
"--per-page",
help=(
"Number of entries to return per page in the response. "
"[env var: GITLAB_PER_PAGE]"
),
required=False,
type=int,
default=os.getenv("GITLAB_PER_PAGE"),
)
parser.add_argument(
"--pagination",
help=(
"Whether to use keyset or offset pagination [env var: GITLAB_PAGINATION]"
),
required=False,
default=os.getenv("GITLAB_PAGINATION"),
)
parser.add_argument(
"--order-by",
help=("Set order_by globally [env var: GITLAB_ORDER_BY]"),
required=False,
default=os.getenv("GITLAB_ORDER_BY"),
)
parser.add_argument(
"--user-agent",
help=(
"The user agent to send to GitLab with the HTTP request. "
"[env var: GITLAB_USER_AGENT]"
),
required=False,
default=os.getenv("GITLAB_USER_AGENT"),
)
tokens = parser.add_mutually_exclusive_group()
tokens.add_argument(
"--private-token",
help=("GitLab private access token [env var: GITLAB_PRIVATE_TOKEN]"),
required=False,
default=os.getenv("GITLAB_PRIVATE_TOKEN"),
)
tokens.add_argument(
"--oauth-token",
help=("GitLab OAuth token [env var: GITLAB_OAUTH_TOKEN]"),
required=False,
default=os.getenv("GITLAB_OAUTH_TOKEN"),
)
tokens.add_argument(
"--job-token",
help=("GitLab CI job token [env var: CI_JOB_TOKEN]"),
required=False,
)
parser.add_argument(
"--skip-login",
help=(
"Skip initial authenticated API call to the current user endpoint. "
"This may be useful when invoking the CLI in scripts. "
"[env var: GITLAB_SKIP_LOGIN]"
),
action="store_true",
default=os.getenv("GITLAB_SKIP_LOGIN"),
)
parser.add_argument(
"--no-mask-credentials",
help="Don't mask credentials in debug mode",
dest="mask_credentials",
action="store_false",
)
return parser
def _get_parser() -> argparse.ArgumentParser:
# NOTE: We must delay import of gitlab.v4.cli until now or
# otherwise it will cause circular import errors
from gitlab.v4 import cli as v4_cli
parser = _get_base_parser()
return v4_cli.extend_parser(parser)
def _parse_value(v: Any) -> Any:
if isinstance(v, str) and v.startswith("@@"):
return v[1:]
if isinstance(v, str) and v.startswith("@"):
# If the user-provided value starts with @, we try to read the file
# path provided after @ as the real value.
filepath = pathlib.Path(v[1:]).expanduser().resolve()
try:
with open(filepath, encoding="utf-8") as f:
return f.read()
except UnicodeDecodeError:
with open(filepath, "rb") as f:
return f.read()
except OSError as exc:
exc_name = type(exc).__name__
sys.stderr.write(f"{exc_name}: {exc}\n")
sys.exit(1)
return v
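# --- Illustrative sketch (not part of the library) ---------------------------
# _parse_value() implements the CLI "@file" convention: "@path" is replaced by
# the file's content, while "@@literal" escapes a leading "@". The temporary
# file below exists only for the demonstration.
import tempfile

from gitlab.cli import _parse_value

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write("content from a file")

print(_parse_value(f"@{tmp.name}"))     # "content from a file"
print(_parse_value("@@not-a-file"))     # "@not-a-file"
print(_parse_value("plain value"))      # unchanged
# ------------------------------------------------------------------------------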
def docs() -> argparse.ArgumentParser: # pragma: no cover
"""
Provide a statically generated parser for sphinx only, so we don't need
to provide dummy gitlab config for readthedocs.
"""
if "sphinx" not in sys.modules:
sys.exit("Docs parser is only intended for build_sphinx")
return _get_parser()
def main() -> None:
if "--version" in sys.argv:
print(gitlab.__version__)
sys.exit(0)
parser = _get_base_parser(add_help=False)
# This first parsing step is used to find the gitlab config to use, and
# load the proper module (v3 or v4) accordingly. At that point we don't have
# any subparser setup
(options, _) = parser.parse_known_args(sys.argv)
try:
config = gitlab.config.GitlabConfigParser(options.gitlab, options.config_file)
except gitlab.config.ConfigError as e:
if "--help" in sys.argv or "-h" in sys.argv:
parser.print_help()
sys.exit(0)
sys.exit(str(e))
# We only support v4 API at this time
if config.api_version not in ("4",): # dead code # pragma: no cover
raise ModuleNotFoundError(f"gitlab.v{config.api_version}.cli")
# Now we build the entire set of subcommands and do the complete parsing
parser = _get_parser()
try:
import argcomplete # type: ignore
argcomplete.autocomplete(parser) # pragma: no cover
except Exception:
pass
args = parser.parse_args()
config_files = args.config_file
gitlab_id = args.gitlab
verbose = args.verbose
output = args.output
fields = []
if args.fields:
fields = [x.strip() for x in args.fields.split(",")]
debug = args.debug
gitlab_resource = args.gitlab_resource
resource_action = args.resource_action
skip_login = args.skip_login
mask_credentials = args.mask_credentials
args_dict = vars(args)
# Remove CLI behavior-related args
for item in (
"api_version",
"config_file",
"debug",
"fields",
"gitlab",
"gitlab_resource",
"job_token",
"mask_credentials",
"oauth_token",
"output",
"pagination",
"private_token",
"resource_action",
"server_url",
"skip_login",
"ssl_verify",
"timeout",
"user_agent",
"verbose",
"version",
):
args_dict.pop(item)
args_dict = {k: _parse_value(v) for k, v in args_dict.items() if v is not None}
try:
gl = gitlab.Gitlab.merge_config(vars(options), gitlab_id, config_files)
if debug:
gl.enable_debug(mask_credentials=mask_credentials)
if not skip_login and (gl.private_token or gl.oauth_token):
gl.auth()
except Exception as e:
die(str(e))
gitlab.v4.cli.run(
gl, gitlab_resource, resource_action, args_dict, verbose, output, fields
)

File diff suppressed because it is too large

View File

@ -0,0 +1,287 @@
import configparser
import os
import shlex
import subprocess
from os.path import expanduser, expandvars
from pathlib import Path
from typing import List, Optional, Union
from gitlab.const import USER_AGENT
_DEFAULT_FILES: List[str] = [
"/etc/python-gitlab.cfg",
str(Path.home() / ".python-gitlab.cfg"),
]
HELPER_PREFIX = "helper:"
HELPER_ATTRIBUTES = ["job_token", "http_password", "private_token", "oauth_token"]
_CONFIG_PARSER_ERRORS = (configparser.NoOptionError, configparser.NoSectionError)
def _resolve_file(filepath: Union[Path, str]) -> str:
resolved = Path(filepath).resolve(strict=True)
return str(resolved)
def _get_config_files(
config_files: Optional[List[str]] = None,
) -> Union[str, List[str]]:
"""
Return resolved path(s) to config files if they exist, with precedence:
1. Files passed in config_files
2. File defined in PYTHON_GITLAB_CFG
3. User- and system-wide config files
"""
resolved_files = []
if config_files:
for config_file in config_files:
try:
resolved = _resolve_file(config_file)
except OSError as e:
raise GitlabConfigMissingError(
f"Cannot read config from file: {e}"
) from e
resolved_files.append(resolved)
return resolved_files
try:
env_config = os.environ["PYTHON_GITLAB_CFG"]
return _resolve_file(env_config)
except KeyError:
pass
except OSError as e:
raise GitlabConfigMissingError(
f"Cannot read config from PYTHON_GITLAB_CFG: {e}"
) from e
for config_file in _DEFAULT_FILES:
try:
resolved = _resolve_file(config_file)
except OSError:
continue
resolved_files.append(resolved)
return resolved_files
class ConfigError(Exception):
pass
class GitlabIDError(ConfigError):
pass
class GitlabDataError(ConfigError):
pass
class GitlabConfigMissingError(ConfigError):
pass
class GitlabConfigHelperError(ConfigError):
pass
class GitlabConfigParser:
def __init__(
self, gitlab_id: Optional[str] = None, config_files: Optional[List[str]] = None
) -> None:
self.gitlab_id = gitlab_id
self.http_username: Optional[str] = None
self.http_password: Optional[str] = None
self.job_token: Optional[str] = None
self.oauth_token: Optional[str] = None
self.private_token: Optional[str] = None
self.api_version: str = "4"
self.order_by: Optional[str] = None
self.pagination: Optional[str] = None
self.per_page: Optional[int] = None
self.retry_transient_errors: bool = False
self.ssl_verify: Union[bool, str] = True
self.timeout: int = 60
self.url: Optional[str] = None
self.user_agent: str = USER_AGENT
self.keep_base_url: bool = False
self._files = _get_config_files(config_files)
if self._files:
self._parse_config()
if self.gitlab_id and not self._files:
raise GitlabConfigMissingError(
f"A gitlab id was provided ({self.gitlab_id}) but no config file found"
)
def _parse_config(self) -> None:
_config = configparser.ConfigParser()
_config.read(self._files, encoding="utf-8")
if self.gitlab_id and not _config.has_section(self.gitlab_id):
raise GitlabDataError(
f"A gitlab id was provided ({self.gitlab_id}) "
"but no config section found"
)
if self.gitlab_id is None:
try:
self.gitlab_id = _config.get("global", "default")
except Exception as e:
raise GitlabIDError(
"Impossible to get the gitlab id (not specified in config file)"
) from e
try:
self.url = _config.get(self.gitlab_id, "url")
except Exception as e:
raise GitlabDataError(
"Impossible to get gitlab details from "
f"configuration ({self.gitlab_id})"
) from e
try:
self.ssl_verify = _config.getboolean("global", "ssl_verify")
except ValueError:
# Value Error means the option exists but isn't a boolean.
# Get as a string instead as it should then be a local path to a
# CA bundle.
self.ssl_verify = _config.get("global", "ssl_verify")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.ssl_verify = _config.getboolean(self.gitlab_id, "ssl_verify")
except ValueError:
# Value Error means the option exists but isn't a boolean.
# Get as a string instead as it should then be a local path to a
# CA bundle.
self.ssl_verify = _config.get(self.gitlab_id, "ssl_verify")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.timeout = _config.getint("global", "timeout")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.timeout = _config.getint(self.gitlab_id, "timeout")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.private_token = _config.get(self.gitlab_id, "private_token")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.oauth_token = _config.get(self.gitlab_id, "oauth_token")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.job_token = _config.get(self.gitlab_id, "job_token")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.http_username = _config.get(self.gitlab_id, "http_username")
self.http_password = _config.get(
self.gitlab_id, "http_password"
) # pragma: no cover
except _CONFIG_PARSER_ERRORS:
pass
self._get_values_from_helper()
try:
self.api_version = _config.get("global", "api_version")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.api_version = _config.get(self.gitlab_id, "api_version")
except _CONFIG_PARSER_ERRORS:
pass
if self.api_version not in ("4",):
raise GitlabDataError(f"Unsupported API version: {self.api_version}")
for section in ["global", self.gitlab_id]:
try:
self.per_page = _config.getint(section, "per_page")
except _CONFIG_PARSER_ERRORS:
pass
if self.per_page is not None and not 0 <= self.per_page <= 100:
raise GitlabDataError(f"Unsupported per_page number: {self.per_page}")
try:
self.pagination = _config.get(self.gitlab_id, "pagination")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.order_by = _config.get(self.gitlab_id, "order_by")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.user_agent = _config.get("global", "user_agent")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.user_agent = _config.get(self.gitlab_id, "user_agent")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.keep_base_url = _config.getboolean("global", "keep_base_url")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.keep_base_url = _config.getboolean(self.gitlab_id, "keep_base_url")
except _CONFIG_PARSER_ERRORS:
pass
try:
self.retry_transient_errors = _config.getboolean(
"global", "retry_transient_errors"
)
except _CONFIG_PARSER_ERRORS:
pass
try:
self.retry_transient_errors = _config.getboolean(
self.gitlab_id, "retry_transient_errors"
)
except _CONFIG_PARSER_ERRORS:
pass
def _get_values_from_helper(self) -> None:
"""Update attributes that may get values from an external helper program"""
for attr in HELPER_ATTRIBUTES:
value = getattr(self, attr)
if not isinstance(value, str):
continue
if not value.lower().strip().startswith(HELPER_PREFIX):
continue
helper = value[len(HELPER_PREFIX) :].strip()
command = [expanduser(expandvars(token)) for token in shlex.split(helper)]
try:
value = (
subprocess.check_output(command, stderr=subprocess.PIPE)
.decode("utf-8")
.strip()
)
except subprocess.CalledProcessError as e:
stderr = e.stderr.decode().strip()
raise GitlabConfigHelperError(
f"Failed to read {attr} value from helper "
f"for {self.gitlab_id}:\n{stderr}"
) from e
setattr(self, attr, value)
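# --- Illustrative sketch (not part of the library) ---------------------------
# Minimal example of the configuration handled above, including the "helper:"
# prefix resolved by _get_values_from_helper(). Section name and URL are
# placeholders; the helper just has to print the secret on stdout (plain
# `echo` is used here, so this runs as-is on Unix-like systems only).
import tempfile

from gitlab.config import GitlabConfigParser

CFG = """
[global]
default = example
ssl_verify = true
timeout = 10

[example]
url = https://gitlab.example.com
private_token = helper: echo secret-token-from-helper
"""

with tempfile.NamedTemporaryFile("w", suffix=".cfg", delete=False) as tmp:
    tmp.write(CFG)

config = GitlabConfigParser(gitlab_id="example", config_files=[tmp.name])
print(config.url)            # https://gitlab.example.com
print(config.private_token)  # secret-token-from-helper (output of the helper)
# ------------------------------------------------------------------------------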

View File

@ -0,0 +1,169 @@
from enum import Enum, IntEnum
from gitlab._version import __title__, __version__
class GitlabEnum(str, Enum):
"""An enum mixed in with str to make it JSON-serializable."""
# https://gitlab.com/gitlab-org/gitlab/-/blob/e97357824bedf007e75f8782259fe07435b64fbb/lib/gitlab/access.rb#L12-18
class AccessLevel(IntEnum):
NO_ACCESS: int = 0
MINIMAL_ACCESS: int = 5
GUEST: int = 10
PLANNER: int = 15
REPORTER: int = 20
DEVELOPER: int = 30
MAINTAINER: int = 40
OWNER: int = 50
ADMIN: int = 60
# https://gitlab.com/gitlab-org/gitlab/-/blob/e97357824bedf007e75f8782259fe07435b64fbb/lib/gitlab/visibility_level.rb#L23-25
class Visibility(GitlabEnum):
PRIVATE: str = "private"
INTERNAL: str = "internal"
PUBLIC: str = "public"
class NotificationLevel(GitlabEnum):
DISABLED: str = "disabled"
PARTICIPATING: str = "participating"
WATCH: str = "watch"
GLOBAL: str = "global"
MENTION: str = "mention"
CUSTOM: str = "custom"
# https://gitlab.com/gitlab-org/gitlab/-/blob/e97357824bedf007e75f8782259fe07435b64fbb/app/views/search/_category.html.haml#L10-37
class SearchScope(GitlabEnum):
# all scopes (global, group and project)
PROJECTS: str = "projects"
ISSUES: str = "issues"
MERGE_REQUESTS: str = "merge_requests"
MILESTONES: str = "milestones"
WIKI_BLOBS: str = "wiki_blobs"
COMMITS: str = "commits"
BLOBS: str = "blobs"
USERS: str = "users"
# specific global scope
GLOBAL_SNIPPET_TITLES: str = "snippet_titles"
# specific project scope
PROJECT_NOTES: str = "notes"
# https://docs.gitlab.com/ee/api/merge_requests.html#merge-status
class DetailedMergeStatus(GitlabEnum):
# possible values for the detailed_merge_status field of Merge Requests
BLOCKED_STATUS: str = "blocked_status"
BROKEN_STATUS: str = "broken_status"
CHECKING: str = "checking"
UNCHECKED: str = "unchecked"
CI_MUST_PASS: str = "ci_must_pass"
CI_STILL_RUNNING: str = "ci_still_running"
DISCUSSIONS_NOT_RESOLVED: str = "discussions_not_resolved"
DRAFT_STATUS: str = "draft_status"
EXTERNAL_STATUS_CHECKS: str = "external_status_checks"
MERGEABLE: str = "mergeable"
NOT_APPROVED: str = "not_approved"
NOT_OPEN: str = "not_open"
POLICIES_DENIED: str = "policies_denied"
# https://docs.gitlab.com/ee/api/pipelines.html
class PipelineStatus(GitlabEnum):
CREATED: str = "created"
WAITING_FOR_RESOURCE: str = "waiting_for_resource"
PREPARING: str = "preparing"
PENDING: str = "pending"
RUNNING: str = "running"
SUCCESS: str = "success"
FAILED: str = "failed"
CANCELED: str = "canceled"
SKIPPED: str = "skipped"
MANUAL: str = "manual"
SCHEDULED: str = "scheduled"
DEFAULT_URL: str = "https://gitlab.com"
NO_ACCESS = AccessLevel.NO_ACCESS.value
MINIMAL_ACCESS = AccessLevel.MINIMAL_ACCESS.value
GUEST_ACCESS = AccessLevel.GUEST.value
REPORTER_ACCESS = AccessLevel.REPORTER.value
DEVELOPER_ACCESS = AccessLevel.DEVELOPER.value
MAINTAINER_ACCESS = AccessLevel.MAINTAINER.value
OWNER_ACCESS = AccessLevel.OWNER.value
ADMIN_ACCESS = AccessLevel.ADMIN.value
VISIBILITY_PRIVATE = Visibility.PRIVATE.value
VISIBILITY_INTERNAL = Visibility.INTERNAL.value
VISIBILITY_PUBLIC = Visibility.PUBLIC.value
NOTIFICATION_LEVEL_DISABLED = NotificationLevel.DISABLED.value
NOTIFICATION_LEVEL_PARTICIPATING = NotificationLevel.PARTICIPATING.value
NOTIFICATION_LEVEL_WATCH = NotificationLevel.WATCH.value
NOTIFICATION_LEVEL_GLOBAL = NotificationLevel.GLOBAL.value
NOTIFICATION_LEVEL_MENTION = NotificationLevel.MENTION.value
NOTIFICATION_LEVEL_CUSTOM = NotificationLevel.CUSTOM.value
# Search scopes
# all scopes (global, group and project)
SEARCH_SCOPE_PROJECTS = SearchScope.PROJECTS.value
SEARCH_SCOPE_ISSUES = SearchScope.ISSUES.value
SEARCH_SCOPE_MERGE_REQUESTS = SearchScope.MERGE_REQUESTS.value
SEARCH_SCOPE_MILESTONES = SearchScope.MILESTONES.value
SEARCH_SCOPE_WIKI_BLOBS = SearchScope.WIKI_BLOBS.value
SEARCH_SCOPE_COMMITS = SearchScope.COMMITS.value
SEARCH_SCOPE_BLOBS = SearchScope.BLOBS.value
SEARCH_SCOPE_USERS = SearchScope.USERS.value
# specific global scope
SEARCH_SCOPE_GLOBAL_SNIPPET_TITLES = SearchScope.GLOBAL_SNIPPET_TITLES.value
# specific project scope
SEARCH_SCOPE_PROJECT_NOTES = SearchScope.PROJECT_NOTES.value
USER_AGENT: str = f"{__title__}/{__version__}"
NO_JSON_RESPONSE_CODES = [204]
RETRYABLE_TRANSIENT_ERROR_CODES = [500, 502, 503, 504] + list(range(520, 531))
__all__ = [
"AccessLevel",
"Visibility",
"NotificationLevel",
"SearchScope",
"ADMIN_ACCESS",
"DEFAULT_URL",
"DEVELOPER_ACCESS",
"GUEST_ACCESS",
"MAINTAINER_ACCESS",
"MINIMAL_ACCESS",
"NO_ACCESS",
"NOTIFICATION_LEVEL_CUSTOM",
"NOTIFICATION_LEVEL_DISABLED",
"NOTIFICATION_LEVEL_GLOBAL",
"NOTIFICATION_LEVEL_MENTION",
"NOTIFICATION_LEVEL_PARTICIPATING",
"NOTIFICATION_LEVEL_WATCH",
"OWNER_ACCESS",
"REPORTER_ACCESS",
"SEARCH_SCOPE_BLOBS",
"SEARCH_SCOPE_COMMITS",
"SEARCH_SCOPE_GLOBAL_SNIPPET_TITLES",
"SEARCH_SCOPE_ISSUES",
"SEARCH_SCOPE_MERGE_REQUESTS",
"SEARCH_SCOPE_MILESTONES",
"SEARCH_SCOPE_PROJECT_NOTES",
"SEARCH_SCOPE_PROJECTS",
"SEARCH_SCOPE_USERS",
"SEARCH_SCOPE_WIKI_BLOBS",
"USER_AGENT",
"VISIBILITY_INTERNAL",
"VISIBILITY_PRIVATE",
"VISIBILITY_PUBLIC",
]
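# --- Illustrative sketch (not part of the library) ---------------------------
# Typical use of the enums defined above when calling the API. The URL, token,
# project ID and user ID are placeholders; adding a member and changing the
# project visibility are standard python-gitlab calls.
import gitlab
from gitlab.const import AccessLevel, Visibility

gl = gitlab.Gitlab("https://gitlab.example.com", private_token="<token>")
project = gl.projects.get(1)

project.members.create({"user_id": 2, "access_level": AccessLevel.DEVELOPER.value})
project.visibility = Visibility.PRIVATE.value
project.save()
# ------------------------------------------------------------------------------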

View File

@ -0,0 +1,428 @@
import functools
from typing import Any, Callable, cast, Optional, Type, TYPE_CHECKING, TypeVar, Union
class GitlabError(Exception):
def __init__(
self,
error_message: Union[str, bytes] = "",
response_code: Optional[int] = None,
response_body: Optional[bytes] = None,
) -> None:
Exception.__init__(self, error_message)
# Http status code
self.response_code = response_code
# Full http response
self.response_body = response_body
# Parsed error message from gitlab
try:
# if we receive str/bytes we try to convert to unicode/str to have
# consistent message types (see #616)
if TYPE_CHECKING:
assert isinstance(error_message, bytes)
self.error_message = error_message.decode()
except Exception:
if TYPE_CHECKING:
assert isinstance(error_message, str)
self.error_message = error_message
def __str__(self) -> str:
if self.response_code is not None:
return f"{self.response_code}: {self.error_message}"
return f"{self.error_message}"
class GitlabAuthenticationError(GitlabError):
pass
class RedirectError(GitlabError):
pass
class GitlabParsingError(GitlabError):
pass
class GitlabCiLintError(GitlabError):
pass
class GitlabConnectionError(GitlabError):
pass
class GitlabOperationError(GitlabError):
pass
class GitlabHttpError(GitlabError):
pass
class GitlabListError(GitlabOperationError):
pass
class GitlabGetError(GitlabOperationError):
pass
class GitlabHeadError(GitlabOperationError):
pass
class GitlabCreateError(GitlabOperationError):
pass
class GitlabUpdateError(GitlabOperationError):
pass
class GitlabDeleteError(GitlabOperationError):
pass
class GitlabSetError(GitlabOperationError):
pass
class GitlabProtectError(GitlabOperationError):
pass
class GitlabTransferProjectError(GitlabOperationError):
pass
class GitlabGroupTransferError(GitlabOperationError):
pass
class GitlabProjectDeployKeyError(GitlabOperationError):
pass
class GitlabPromoteError(GitlabOperationError):
pass
class GitlabCancelError(GitlabOperationError):
pass
class GitlabPipelineCancelError(GitlabCancelError):
pass
class GitlabRetryError(GitlabOperationError):
pass
class GitlabBuildCancelError(GitlabCancelError):
pass
class GitlabBuildRetryError(GitlabRetryError):
pass
class GitlabBuildPlayError(GitlabRetryError):
pass
class GitlabBuildEraseError(GitlabRetryError):
pass
class GitlabJobCancelError(GitlabCancelError):
pass
class GitlabJobRetryError(GitlabRetryError):
pass
class GitlabJobPlayError(GitlabRetryError):
pass
class GitlabJobEraseError(GitlabRetryError):
pass
class GitlabPipelinePlayError(GitlabRetryError):
pass
class GitlabPipelineRetryError(GitlabRetryError):
pass
class GitlabBlockError(GitlabOperationError):
pass
class GitlabUnblockError(GitlabOperationError):
pass
class GitlabDeactivateError(GitlabOperationError):
pass
class GitlabActivateError(GitlabOperationError):
pass
class GitlabBanError(GitlabOperationError):
pass
class GitlabUnbanError(GitlabOperationError):
pass
class GitlabSubscribeError(GitlabOperationError):
pass
class GitlabUnsubscribeError(GitlabOperationError):
pass
class GitlabMRForbiddenError(GitlabOperationError):
pass
class GitlabMRApprovalError(GitlabOperationError):
pass
class GitlabMRRebaseError(GitlabOperationError):
pass
class GitlabMRResetApprovalError(GitlabOperationError):
pass
class GitlabMRClosedError(GitlabOperationError):
pass
class GitlabMROnBuildSuccessError(GitlabOperationError):
pass
class GitlabTodoError(GitlabOperationError):
pass
class GitlabTopicMergeError(GitlabOperationError):
pass
class GitlabTimeTrackingError(GitlabOperationError):
pass
class GitlabUploadError(GitlabOperationError):
pass
class GitlabAttachFileError(GitlabOperationError):
pass
class GitlabImportError(GitlabOperationError):
pass
class GitlabInvitationError(GitlabOperationError):
pass
class GitlabCherryPickError(GitlabOperationError):
pass
class GitlabHousekeepingError(GitlabOperationError):
pass
class GitlabOwnershipError(GitlabOperationError):
pass
class GitlabSearchError(GitlabOperationError):
pass
class GitlabStopError(GitlabOperationError):
pass
class GitlabMarkdownError(GitlabOperationError):
pass
class GitlabVerifyError(GitlabOperationError):
pass
class GitlabRenderError(GitlabOperationError):
pass
class GitlabRepairError(GitlabOperationError):
pass
class GitlabRestoreError(GitlabOperationError):
pass
class GitlabRevertError(GitlabOperationError):
pass
class GitlabRotateError(GitlabOperationError):
pass
class GitlabLicenseError(GitlabOperationError):
pass
class GitlabFollowError(GitlabOperationError):
pass
class GitlabUnfollowError(GitlabOperationError):
pass
class GitlabUserApproveError(GitlabOperationError):
pass
class GitlabUserRejectError(GitlabOperationError):
pass
class GitlabDeploymentApprovalError(GitlabOperationError):
pass
class GitlabHookTestError(GitlabOperationError):
pass
# For an explanation of how these type-hints work see:
# https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators
#
# The goal here is that functions which get decorated will retain their types.
__F = TypeVar("__F", bound=Callable[..., Any])
def on_http_error(error: Type[Exception]) -> Callable[[__F], __F]:
"""Manage GitlabHttpError exceptions.
This decorator function can be used to catch GitlabHttpError exceptions
and raise specialized exceptions instead.
Args:
error: The exception type to raise -- must inherit from GitlabError
"""
def wrap(f: __F) -> __F:
@functools.wraps(f)
def wrapped_f(*args: Any, **kwargs: Any) -> Any:
try:
return f(*args, **kwargs)
except GitlabHttpError as e:
raise error(e.error_message, e.response_code, e.response_body) from e
return cast(__F, wrapped_f)
return wrap
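# --- Illustrative sketch (not part of the library) ---------------------------
# How the decorator above narrows a generic GitlabHttpError into a specific
# exception type. The decorated function is a stand-in for the HTTP helpers
# the mixins wrap; the 404 is simulated, no request is made.
from gitlab.exceptions import GitlabGetError, GitlabHttpError, on_http_error


@on_http_error(GitlabGetError)
def fetch_something() -> None:
    # Pretend the server answered 404 somewhere deeper in the call stack.
    raise GitlabHttpError("404 Not found", response_code=404)


try:
    fetch_something()
except GitlabGetError as exc:
    print(exc.response_code, exc.error_message)   # 404 404 Not found
# ------------------------------------------------------------------------------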
# Export manually to keep mypy happy
__all__ = [
"GitlabActivateError",
"GitlabAttachFileError",
"GitlabAuthenticationError",
"GitlabBanError",
"GitlabBlockError",
"GitlabBuildCancelError",
"GitlabBuildEraseError",
"GitlabBuildPlayError",
"GitlabBuildRetryError",
"GitlabCancelError",
"GitlabCherryPickError",
"GitlabCiLintError",
"GitlabConnectionError",
"GitlabCreateError",
"GitlabDeactivateError",
"GitlabDeleteError",
"GitlabDeploymentApprovalError",
"GitlabError",
"GitlabFollowError",
"GitlabGetError",
"GitlabGroupTransferError",
"GitlabHeadError",
"GitlabHookTestError",
"GitlabHousekeepingError",
"GitlabHttpError",
"GitlabImportError",
"GitlabInvitationError",
"GitlabJobCancelError",
"GitlabJobEraseError",
"GitlabJobPlayError",
"GitlabJobRetryError",
"GitlabLicenseError",
"GitlabListError",
"GitlabMRApprovalError",
"GitlabMRClosedError",
"GitlabMRForbiddenError",
"GitlabMROnBuildSuccessError",
"GitlabMRRebaseError",
"GitlabMRResetApprovalError",
"GitlabMarkdownError",
"GitlabOperationError",
"GitlabOwnershipError",
"GitlabParsingError",
"GitlabPipelineCancelError",
"GitlabPipelinePlayError",
"GitlabPipelineRetryError",
"GitlabProjectDeployKeyError",
"GitlabPromoteError",
"GitlabProtectError",
"GitlabRenderError",
"GitlabRepairError",
"GitlabRestoreError",
"GitlabRetryError",
"GitlabRevertError",
"GitlabRotateError",
"GitlabSearchError",
"GitlabSetError",
"GitlabStopError",
"GitlabSubscribeError",
"GitlabTimeTrackingError",
"GitlabTodoError",
"GitlabTopicMergeError",
"GitlabTransferProjectError",
"GitlabUnbanError",
"GitlabUnblockError",
"GitlabUnfollowError",
"GitlabUnsubscribeError",
"GitlabUpdateError",
"GitlabUploadError",
"GitlabUserApproveError",
"GitlabUserRejectError",
"GitlabVerifyError",
"RedirectError",
]

File diff suppressed because it is too large

View File

View File

@ -0,0 +1,105 @@
import dataclasses
from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING
@dataclasses.dataclass(frozen=True)
class RequiredOptional:
required: Tuple[str, ...] = ()
optional: Tuple[str, ...] = ()
exclusive: Tuple[str, ...] = ()
def validate_attrs(
self,
*,
data: Dict[str, Any],
excludes: Optional[List[str]] = None,
) -> None:
if excludes is None:
excludes = []
if self.required:
required = [k for k in self.required if k not in excludes]
missing = [attr for attr in required if attr not in data]
if missing:
raise AttributeError(f"Missing attributes: {', '.join(missing)}")
if self.exclusive:
exclusives = [attr for attr in data if attr in self.exclusive]
if len(exclusives) > 1:
raise AttributeError(
f"Provide only one of these attributes: {', '.join(exclusives)}"
)
if not exclusives:
raise AttributeError(
f"Must provide one of these attributes: "
f"{', '.join(self.exclusive)}"
)
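# --- Illustrative sketch (not part of the library) ---------------------------
# RequiredOptional is how managers declare which attributes create()/update()
# accept; validate_attrs() is the check that runs before a request is sent.
# The attribute names below are arbitrary examples.
from gitlab.types import RequiredOptional

spec = RequiredOptional(required=("name",), optional=("description",))

spec.validate_attrs(data={"name": "thing", "description": "ok"})   # passes
try:
    spec.validate_attrs(data={"description": "missing the name"})
except AttributeError as exc:
    print(exc)   # Missing attributes: name
# ------------------------------------------------------------------------------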
class GitlabAttribute:
def __init__(self, value: Any = None) -> None:
self._value = value
def get(self) -> Any:
return self._value
def set_from_cli(self, cli_value: Any) -> None:
self._value = cli_value
def get_for_api(self, *, key: str) -> Tuple[str, Any]:
return (key, self._value)
class _ListArrayAttribute(GitlabAttribute):
"""Helper class to support `list` / `array` types."""
def set_from_cli(self, cli_value: str) -> None:
if not cli_value.strip():
self._value = []
else:
self._value = [item.strip() for item in cli_value.split(",")]
def get_for_api(self, *, key: str) -> Tuple[str, str]:
# Do not comma-split single value passed as string
if isinstance(self._value, str):
return (key, self._value)
if TYPE_CHECKING:
assert isinstance(self._value, list)
return (key, ",".join([str(x) for x in self._value]))
class ArrayAttribute(_ListArrayAttribute):
"""To support `array` types as documented in
https://docs.gitlab.com/ee/api/#array"""
def get_for_api(self, *, key: str) -> Tuple[str, Any]:
if isinstance(self._value, str):
return (f"{key}[]", self._value)
if TYPE_CHECKING:
assert isinstance(self._value, list)
return (f"{key}[]", self._value)
class CommaSeparatedListAttribute(_ListArrayAttribute):
"""For values which are sent to the server as a Comma Separated Values
(CSV) string. We allow them to be specified as a list and we convert it
into a CSV"""
class LowercaseStringAttribute(GitlabAttribute):
def get_for_api(self, *, key: str) -> Tuple[str, str]:
return (key, str(self._value).lower())
class FileAttribute(GitlabAttribute):
@staticmethod
def get_file_name(attr_name: Optional[str] = None) -> Optional[str]:
return attr_name
class ImageAttribute(FileAttribute):
@staticmethod
def get_file_name(attr_name: Optional[str] = None) -> str:
return f"{attr_name}.png" if attr_name else "image.png"
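# --- Illustrative sketch (not part of the library) ---------------------------
# The *Attribute classes above control how list values are encoded for the
# API: CommaSeparatedListAttribute joins values into a CSV string, while
# ArrayAttribute uses the "key[]" form documented at
# https://docs.gitlab.com/ee/api/#array. The keys below are examples.
from gitlab.types import ArrayAttribute, CommaSeparatedListAttribute

csv_attr = CommaSeparatedListAttribute([1, 2, 3])
print(csv_attr.get_for_api(key="iids"))      # ('iids', '1,2,3')

array_attr = ArrayAttribute(["alpha", "beta"])
print(array_attr.get_for_api(key="tags"))    # ('tags[]', ['alpha', 'beta'])

cli_attr = CommaSeparatedListAttribute()
cli_attr.set_from_cli("a, b , c")
print(cli_attr.get_for_api(key="labels"))    # ('labels', 'a,b,c')
# ------------------------------------------------------------------------------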

View File

@ -0,0 +1,303 @@
import dataclasses
import email.message
import logging
import pathlib
import time
import traceback
import urllib.parse
import warnings
from typing import (
Any,
Callable,
Dict,
Iterator,
Literal,
MutableMapping,
Optional,
Tuple,
Type,
Union,
)
import requests
from gitlab import const, types
class _StdoutStream:
def __call__(self, chunk: Any) -> None:
print(chunk)
def get_base_url(url: Optional[str] = None) -> str:
"""Return the base URL with the trailing slash stripped.
If the URL is a Falsy value, return the default URL.
Returns:
The base URL
"""
if not url:
return const.DEFAULT_URL
return url.rstrip("/")
def get_content_type(content_type: Optional[str]) -> str:
message = email.message.Message()
if content_type is not None:
message["content-type"] = content_type
return message.get_content_type()
class MaskingFormatter(logging.Formatter):
"""A logging formatter that can mask credentials"""
def __init__(
self,
fmt: Optional[str] = logging.BASIC_FORMAT,
datefmt: Optional[str] = None,
style: Literal["%", "{", "$"] = "%",
validate: bool = True,
masked: Optional[str] = None,
) -> None:
super().__init__(fmt, datefmt, style, validate)
self.masked = masked
def _filter(self, entry: str) -> str:
if not self.masked:
return entry
return entry.replace(self.masked, "[MASKED]")
def format(self, record: logging.LogRecord) -> str:
original = logging.Formatter.format(self, record)
return self._filter(original)
def response_content(
response: requests.Response,
streamed: bool,
action: Optional[Callable[[bytes], None]],
chunk_size: int,
*,
iterator: bool,
) -> Optional[Union[bytes, Iterator[Any]]]:
if iterator:
return response.iter_content(chunk_size=chunk_size)
if streamed is False:
return response.content
if action is None:
action = _StdoutStream()
for chunk in response.iter_content(chunk_size=chunk_size):
if chunk:
action(chunk)
return None
class Retry:
def __init__(
self,
max_retries: int,
obey_rate_limit: Optional[bool] = True,
retry_transient_errors: Optional[bool] = False,
) -> None:
self.cur_retries = 0
self.max_retries = max_retries
self.obey_rate_limit = obey_rate_limit
self.retry_transient_errors = retry_transient_errors
def _retryable_status_code(
self, status_code: Optional[int], reason: str = ""
) -> bool:
if status_code == 429 and self.obey_rate_limit:
return True
if not self.retry_transient_errors:
return False
if status_code in const.RETRYABLE_TRANSIENT_ERROR_CODES:
return True
if status_code == 409 and "Resource lock" in reason:
return True
return False
def handle_retry_on_status(
self,
status_code: Optional[int],
headers: Optional[MutableMapping[str, str]] = None,
reason: str = "",
) -> bool:
if not self._retryable_status_code(status_code, reason):
return False
if headers is None:
headers = {}
# Response headers documentation:
# https://docs.gitlab.com/ee/user/admin_area/settings/user_and_ip_rate_limits.html#response-headers
if self.max_retries == -1 or self.cur_retries < self.max_retries:
wait_time = 2**self.cur_retries * 0.1
if "Retry-After" in headers:
wait_time = int(headers["Retry-After"])
elif "RateLimit-Reset" in headers:
wait_time = int(headers["RateLimit-Reset"]) - time.time()
self.cur_retries += 1
time.sleep(wait_time)
return True
return False
def handle_retry(self) -> bool:
if self.retry_transient_errors and (
self.max_retries == -1 or self.cur_retries < self.max_retries
):
wait_time = 2**self.cur_retries * 0.1
self.cur_retries += 1
time.sleep(wait_time)
return True
return False
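# --- Illustrative sketch (not part of the library) ---------------------------
# Retry is the backoff helper used by the HTTP layer. handle_retry_on_status()
# sleeps and returns True when the call should be retried, honouring the
# Retry-After header; a 400 is never retried. The header values are examples.
from gitlab.utils import Retry

retry = Retry(max_retries=3, obey_rate_limit=True, retry_transient_errors=True)

print(retry.handle_retry_on_status(429, headers={"Retry-After": "0"}))  # True
print(retry.handle_retry_on_status(502, headers={}))                    # True
print(retry.handle_retry_on_status(400))                                # False
# ------------------------------------------------------------------------------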
def _transform_types(
data: Dict[str, Any],
custom_types: Dict[str, Any],
*,
transform_data: bool,
transform_files: Optional[bool] = True,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""Copy the data dict with attributes that have custom types and transform them
before being sent to the server.
``transform_files``: If ``True`` (default), also populates the ``files`` dict for
FileAttribute types with tuples to prepare fields for requests' MultipartEncoder:
https://toolbelt.readthedocs.io/en/latest/user.html#multipart-form-data-encoder
``transform_data``: If ``True`` transforms the ``data`` dict with fields
suitable for encoding as query parameters for GitLab's API:
https://docs.gitlab.com/ee/api/#encoding-api-parameters-of-array-and-hash-types
Returns:
A tuple of the transformed data dict and files dict"""
# Duplicate data to avoid messing with what the user sent us
data = data.copy()
if not transform_files and not transform_data:
return data, {}
files = {}
for attr_name, attr_class in custom_types.items():
if attr_name not in data:
continue
gitlab_attribute = attr_class(data[attr_name])
# if the type is FileAttribute we need to pass the data as file
if isinstance(gitlab_attribute, types.FileAttribute) and transform_files:
key = gitlab_attribute.get_file_name(attr_name)
files[attr_name] = (key, data.pop(attr_name))
continue
if not transform_data:
continue
if isinstance(gitlab_attribute, types.GitlabAttribute):
key, value = gitlab_attribute.get_for_api(key=attr_name)
if key != attr_name:
del data[attr_name]
data[key] = value
return data, files
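# --- Illustrative sketch (not part of the library) ---------------------------
# What _transform_types() does with the custom_types mapping that the mixins
# pass in: list values are flattened to the API encoding, FileAttribute values
# are moved into the files dict. The attribute names and bytes are examples.
from gitlab.types import CommaSeparatedListAttribute, ImageAttribute
from gitlab.utils import _transform_types

data = {"labels": ["a", "b"], "avatar": b"<png bytes>"}
custom_types = {"labels": CommaSeparatedListAttribute, "avatar": ImageAttribute}
data, files = _transform_types(data, custom_types, transform_data=True)
print(data)    # {'labels': 'a,b'}
print(files)   # {'avatar': ('avatar.png', b'<png bytes>')}
# ------------------------------------------------------------------------------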
def copy_dict(
*,
src: Dict[str, Any],
dest: Dict[str, Any],
) -> None:
for k, v in src.items():
if isinstance(v, dict):
# NOTE(jlvillal): This provides some support for the `hash` type
# https://docs.gitlab.com/ee/api/#hash
# Transform dict values to new attributes. For example:
# custom_attributes: {'foo', 'bar'} =>
# "custom_attributes['foo']": "bar"
for dict_k, dict_v in v.items():
dest[f"{k}[{dict_k}]"] = dict_v
else:
dest[k] = v
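# --- Illustrative sketch (not part of the library) ---------------------------
# copy_dict() flattens nested dicts into the "hash" form GitLab expects in
# query parameters (https://docs.gitlab.com/ee/api/#hash). Keys and values
# below are arbitrary examples.
from gitlab.utils import copy_dict

dest: dict = {}
copy_dict(src={"name": "thing", "custom_attributes": {"foo": "bar"}}, dest=dest)
print(dest)   # {'name': 'thing', 'custom_attributes[foo]': 'bar'}
# ------------------------------------------------------------------------------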
class EncodedId(str):
"""A custom `str` class that will return the URL-encoded value of the string.
* Using it recursively will only url-encode the value once.
* Can accept either `str` or `int` as input value.
* Can be used in an f-string and output the URL-encoded string.
Reference to documentation on why this is necessary.
See::
https://docs.gitlab.com/ee/api/index.html#namespaced-path-encoding
https://docs.gitlab.com/ee/api/index.html#path-parameters
"""
def __new__(cls, value: Union[str, int, "EncodedId"]) -> "EncodedId":
if isinstance(value, EncodedId):
return value
if not isinstance(value, (int, str)):
raise TypeError(f"Unsupported type received: {type(value)}")
if isinstance(value, str):
value = urllib.parse.quote(value, safe="")
return super().__new__(cls, value)
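# --- Illustrative sketch (not part of the library) ---------------------------
# EncodedId is what encoded_id/_compute_path() rely on to make IDs safe in URL
# paths: namespaced paths are quoted once and only once, ints pass through.
from gitlab.utils import EncodedId

print(EncodedId("group/subgroup/project"))             # group%2Fsubgroup%2Fproject
print(EncodedId(EncodedId("group/project")))           # still encoded only once
print(EncodedId(42))                                   # 42
print(f"/projects/{EncodedId('dot.name/repo')}/jobs")  # /projects/dot.name%2Frepo/jobs
# ------------------------------------------------------------------------------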
def remove_none_from_dict(data: Dict[str, Any]) -> Dict[str, Any]:
return {k: v for k, v in data.items() if v is not None}
def warn(
message: str,
*,
category: Optional[Type[Warning]] = None,
source: Optional[Any] = None,
show_caller: bool = True,
) -> None:
"""This `warnings.warn` wrapper function attempts to show the location causing the
warning in the user code that called the library.
It does this by walking up the stack trace to find the first frame located outside
the `gitlab/` directory. This is helpful to users as it shows them their code that
is causing the warning.
"""
# Get `stacklevel` for user code so we indicate where issue is in
# their code.
pg_dir = pathlib.Path(__file__).parent.resolve()
stack = traceback.extract_stack()
stacklevel = 1
warning_from = ""
for stacklevel, frame in enumerate(reversed(stack), start=1):
warning_from = f" (python-gitlab: {frame.filename}:{frame.lineno})"
frame_dir = str(pathlib.Path(frame.filename).parent.resolve())
if not frame_dir.startswith(str(pg_dir)):
break
if show_caller:
message += warning_from
warnings.warn(
message=message,
category=category,
stacklevel=stacklevel,
source=source,
)
@dataclasses.dataclass
class WarnMessageData:
message: str
show_caller: bool

Some files were not shown because too many files have changed in this diff