gcc-changelog: workaround for utf8 filenames

contrib/ChangeLog:

	* gcc-changelog/git_commit.py: Add decode_path function.
	* gcc-changelog/git_email.py: Use it in order to solve
	UTF-8 filename encoding issues.
	* gcc-changelog/git_repository.py: Likewise.
	* gcc-changelog/test_email.py: Test it.
This commit is contained in:
Martin Liska 2021-01-06 08:11:57 +01:00
parent ac3966e315
commit 57706dd7e0
4 changed files with 26 additions and 15 deletions

View file

@ -174,6 +174,24 @@ REVIEW_PREFIXES = ('reviewed-by: ', 'reviewed-on: ', 'signed-off-by: ',
DATE_FORMAT = '%Y-%m-%d'
def decode_path(path):
    """Return *path* with git's C-style quoting undone, if it is quoted.

    A path that both starts and ends with a double quote is treated as
    quoted: the enclosing quotes are removed and backslash escapes
    (including octal-escaped UTF-8 bytes) are decoded.  Any other path is
    returned unchanged.
    """
    # When core.quotepath is true (default value), utf8 chars are encoded like:
    # "b/ko\304\215ka.txt"
    #
    # The upstream bug is fixed:
    # https://github.com/gitpython-developers/GitPython/issues/1099
    #
    # but we still need a workaround for older versions of the library.
    # Please take a look at the explanation of the transformation:
    # https://stackoverflow.com/questions/990169/how-do-convert-unicode-escape-sequences-to-unicode-characters-in-a-python-string
    if not (path.startswith('"') and path.endswith('"')):
        return path

    # Drop exactly one enclosing quote on each side.  str.strip('"') would
    # also eat the quote of a trailing escaped quote (\"), leaving a dangling
    # backslash that makes the unicode-escape decoder raise.
    inner = path[1:-1]

    # unicode-escape turns every escape into a code point below 256;
    # re-encoding as latin-1 recovers the raw bytes, which are then decoded
    # as the UTF-8 they really are.
    raw = inner.encode('utf8').decode('unicode-escape').encode('latin-1')
    return raw.decode('utf8')
class Error:
def __init__(self, message, line=None):
self.message = message
@ -303,14 +321,6 @@ class GitCommit:
'separately from normal commits'))
return
# check for an encoded utf-8 filename
hint = 'git config --global core.quotepath false'
for modified, _ in self.info.modified_files:
if modified.startswith('"') or modified.endswith('"'):
self.errors.append(Error('Quoted UTF8 filename, please set: '
f'"{hint}"', modified))
return
all_are_ignored = (len(project_files) + len(ignored_files)
== len(self.info.modified_files))
self.parse_lines(all_are_ignored)

View file

@ -22,7 +22,7 @@ from itertools import takewhile
from dateutil.parser import parse
from git_commit import GitCommit, GitInfo
from git_commit import GitCommit, GitInfo, decode_path
from unidiff import PatchSet, PatchedFile
@ -52,8 +52,8 @@ class GitEmail(GitCommit):
modified_files = []
for f in diff:
# Strip "a/" and "b/" prefixes
source = f.source_file[2:]
target = f.target_file[2:]
source = decode_path(f.source_file)[2:]
target = decode_path(f.target_file)[2:]
if f.is_added_file:
t = 'A'

View file

@ -26,7 +26,7 @@ except ImportError:
print(' Debian, Ubuntu: python3-git')
exit(1)
from git_commit import GitCommit, GitInfo
from git_commit import GitCommit, GitInfo, decode_path
def parse_git_revisions(repo_path, revisions, strict=True):
@ -51,11 +51,11 @@ def parse_git_revisions(repo_path, revisions, strict=True):
# Consider that renamed files are two operations:
# the deletion of the original name
# and the addition of the new one.
modified_files.append((file.a_path, 'D'))
modified_files.append((decode_path(file.a_path), 'D'))
t = 'A'
else:
t = 'M'
modified_files.append((file.b_path, t))
modified_files.append((decode_path(file.b_path), t))
date = datetime.utcfromtimestamp(c.committed_date)
author = '%s <%s>' % (c.author.name, c.author.email)

View file

@ -402,4 +402,5 @@ class TestGccChangelog(unittest.TestCase):
def test_bad_unicode_chars_in_filename(self):
email = self.from_patch_glob('0001-Add-horse2.patch')
assert email.errors[0].message.startswith('Quoted UTF8 filename')
assert not email.errors
assert email.changelog_entries[0].files == ['koníček.txt']