[formatter] implement slicing strings as bytes (#4087)

Prefixing the indices of a slice with a lowercase 'b' (for example '[b10:30]' instead of '[10:30]') encodes the string to bytes in filesystem encoding before applying the slice; the sliced bytes are then decoded back to a string.
2024-11-22 10:42:34 +01:00 · 2023-05-22 18:30:45 +02:00 · 2023-05-22 18:30:45 +02:00 · 69865dcc05
commit 69865dcc05
parent 56b8b8cd36
3 changed files with 66 additions and 16 deletions
--- a/docs/formatting.md
+++ b/docs/formatting.md
@ -11,14 +11,15 @@ Field names select the metadata value to use in a replacement field.
 While simple names are usually enough, more complex forms like accessing values by attribute, element index, or slicing are also supported.
-|                      | Example           | Result                 |
+|                      | Example             | Result                 |
-| -------------------- | ----------------- | ---------------------- |
+| -------------------- | ------------------- | ---------------------- |
-| Name                 | `{title}`         | `Hello World`          |
+| Name                 | `{title}`           | `Hello World`          |
-| Element Index        | `{title[6]}`      | `W`                    |
+| Element Index        | `{title[6]}`        | `W`                    |
-| Slicing              | `{title[3:8]}`    | `lo Wo`                |
+| Slicing              | `{title[3:8]}`      | `lo Wo`                |
-| Alternatives         | `{empty\|title}`  | `Hello World`          |
+| Slicing (Bytes)      | `{title_ja[b3:18]}` | `ロー・ワー`           |
-| Element Access       | `{user[name]}`    | `John Doe`             |
+| Alternatives         | `{empty\|title}`    | `Hello World`          |
-| Attribute Access     | `{extractor.url}` | `https://example.org/` |
+| Element Access       | `{user[name]}`      | `John Doe`             |
 | Attribute Access     | `{extractor.url}`   | `https://example.org/` |
 All of these methods can be combined as needed.
 For example `{title[24]|empty|extractor.url[15:-1]}` would result in `.org`.
@ -150,6 +151,12 @@ Format specifiers can be used for advanced formatting by using the options provi
    <td><code>{foo:[1:-1]}</code></td>
    <td><code>oo&nbsp;Ba</code></td>
 </tr>
 <tr>
    <td><code>[b&lt;start&gt;:&lt;stop&gt;]</code></td>
    <td>Same as above, but applies to the <a href="https://docs.python.org/3/library/stdtypes.html#bytes"><code>bytes()</code></a> representation of a string in <a href="https://docs.python.org/3/library/sys.html#sys.getfilesystemencoding">filesystem encoding</a></td>
    <td><code>{foo_ja:[b3:-1]}</code></td>
    <td><code>ー・バ</code></td>
 </tr>
 <tr>
    <td rowspan="2"><code>L&lt;maxlen&gt;/&lt;repl&gt;/</code></td>
    <td rowspan="2">Replaces the entire output with <code>&lt;repl&gt;</code> if its length exceeds <code>&lt;maxlen&gt;</code></td>
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@ -9,6 +9,7 @@
 """String formatters"""
 import os
 import sys
 import time
 import string
 import _string
@ -255,7 +256,11 @@ def parse_field_name(field_name):
            func = operator.itemgetter
            try:
                if ":" in key:
-                    key = _slice(key)
+                    if key[0] == "b":
                        func = _bytesgetter
                        key = _slice(key[1:])
                    else:
                        key = _slice(key)
                else:
                    key = key.strip("\"'")
            except TypeError:
@ -276,6 +281,14 @@ def _slice(indices):
    )
 def _bytesgetter(slice, encoding=sys.getfilesystemencoding()):
    """Return a function that slices a string by bytes, not characters

    The returned callable encodes its argument with 'encoding',
    applies 'slice' to the resulting bytes, and decodes what is left.
    Bytes that cannot be decoded after slicing (e.g. a character that
    was cut in half) are dropped via the "ignore" error handler.
    """
    def apply_slice_bytes(obj):
        encoded = obj.encode(encoding)
        return encoded[slice].decode(encoding, "ignore")
    return apply_slice_bytes
 def _build_format_func(format_spec, default):
    if format_spec:
        return _FORMAT_SPECIFIERS.get(
@ -295,11 +308,20 @@ def _parse_optional(format_spec, default):
 def _parse_slice(format_spec, default):
    """Build a format function for a '[start:stop:step]' specifier

    'format_spec' is split at the first "]": the part before it holds
    the slice indices (including the leading "["), the part after it
    is handed to _build_format_func() together with 'default'.

    If the indices start with a lowercase "b" ('[b1:10]'), the slice
    is applied through _bytesgetter(); otherwise the object itself is
    sliced.

    Returns a callable that slices its argument and passes the result
    on to the remaining format function.
    """
    indices, _, format_spec = format_spec.partition("]")
    fmt = _build_format_func(format_spec, default)
    # indices[0] is the opening "[". Use startswith() rather than
    # indices[1] so that a bare "[" does not raise IndexError.
    if indices.startswith("b", 1):
        # strip "[b" before parsing; _slice() only receives the indices
        slice_bytes = _bytesgetter(_slice(indices[2:]))
        def apply_slice(obj):
            return fmt(slice_bytes(obj))
    else:
        slice = _slice(indices[1:])
        def apply_slice(obj):
            return fmt(obj[slice])
    return apply_slice
--- a/test/test_formatter.py
+++ b/test/test_formatter.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-# Copyright 2021-2022 Mike Fährmann
+# Copyright 2021-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -23,6 +23,7 @@ class TestFormatter(unittest.TestCase):
    kwdict = {
        "a": "hElLo wOrLd",
        "b": "äöü",
        "j": "げんそうきょう",
        "d": {"a": "foo", "b": 0, "c": None},
        "l": ["a", "b", "c"],
        "n": None,
@ -133,7 +134,7 @@ class TestFormatter(unittest.TestCase):
        self._run_test("{d['a']}", "foo")
        self._run_test('{d["a"]}', "foo")
-    def test_slicing(self):
+    def test_slice_str(self):
        v = self.kwdict["a"]
        self._run_test("{a[1:10]}"  , v[1:10])
        self._run_test("{a[-10:-1]}", v[-10:-1])
@ -165,6 +166,26 @@ class TestFormatter(unittest.TestCase):
        self._run_test("{a:[:50:2]}", v[:50:2])
        self._run_test("{a:[::]}"   , v)
    def test_slice_bytes(self):
        v = self.kwdict["j"]
        self._run_test("{j[b1:10]}"  , v[1:3])
        self._run_test("{j[b-10:-1]}", v[-3:-1])
        self._run_test("{j[b5:]}"    , v[2:])
        self._run_test("{j[b50:]}"   , v[50:])
        self._run_test("{j[b:5]}"    , v[:1])
        self._run_test("{j[b:50]}"   , v[:50])
        self._run_test("{j[b:]}"     , v)
        self._run_test("{j[b::]}"    , v)
        self._run_test("{j:[b1:10]}"  , v[1:3])
        self._run_test("{j:[b-10:-1]}", v[-3:-1])
        self._run_test("{j:[b5:]}"    , v[2:])
        self._run_test("{j:[b50:]}"   , v[50:])
        self._run_test("{j:[b:5]}"    , v[:1])
        self._run_test("{j:[b:50]}"   , v[:50])
        self._run_test("{j:[b:]}"     , v)
        self._run_test("{j:[b::]}"    , v)
    def test_maxlen(self):
        v = self.kwdict["a"]
        self._run_test("{a:L5/foo/}" , "foo")
@ -413,10 +434,10 @@ def noarg():
            fmt4 = formatter.parse("\fM " + path + ":lengths")
        self.assertEqual(fmt1.format_map(self.kwdict), "'Title' by Name")
-        self.assertEqual(fmt2.format_map(self.kwdict), "89")
+        self.assertEqual(fmt2.format_map(self.kwdict), "96")
        self.assertEqual(fmt3.format_map(self.kwdict), "'Title' by Name")
-        self.assertEqual(fmt4.format_map(self.kwdict), "89")
+        self.assertEqual(fmt4.format_map(self.kwdict), "96")
        with self.assertRaises(TypeError):
            self.assertEqual(fmt0.format_map(self.kwdict), "")