# -*- coding: utf-8 -*-

# Copyright 2015 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Collection of functions that work on strings/text"""

import sys
import re
import os.path
import html
import html.parser
import urllib.parse
import platform


def remove_html(text):
    """Remove html-tags from a string

    Every tag is replaced by a space; afterwards all runs of whitespace
    are collapsed into single spaces and leading/trailing whitespace is
    dropped.
    """
    return " ".join(re.sub(r"<[^>]+?>", " ", text).split())


def filename_from_url(url):
    """Extract the last part of an url to use as a filename

    Returns everything after the last "/" of the url's path component.
    If the path contains no "/" (or the url cannot be parsed), the url
    itself is returned unchanged.
    """
    try:
        path = urllib.parse.urlparse(url).path
        pos = path.rindex("/")
        return path[pos+1:]
    except ValueError:
        # raised by rindex() if there is no "/" and by urlparse() for
        # malformed urls
        return url


def clean_path_windows(path):
    """Remove illegal characters from a path-segment (Windows)

    Each of the characters < > : " \\ / | ? * is replaced by "_".
    Values that are not strings are returned unchanged.
    """
    try:
        return re.sub(r'[<>:"\\/|?*]', "_", path)
    except TypeError:
        return path


def clean_path_posix(path):
    """Remove illegal characters from a path-segment (Posix)

    Each "/" is replaced by "_".
    Values without a 'replace' method are returned unchanged.
    """
    try:
        return path.replace("/", "_")
    except AttributeError:
        return path


def shorten_path(path, limit=255, encoding=sys.getfilesystemencoding()):
    """Shorten a path segment to at most 'limit' bytes

    The length is measured on the encoded form of 'path'. A multi-byte
    character that got cut in half by the truncation is dropped.
    """
    return (path.encode(encoding)[:limit]).decode(encoding, "ignore")


def shorten_filename(filename, limit=255,
                     encoding=sys.getfilesystemencoding()):
    """Shorten a filename to at most 'limit' bytes while preserving extension

    Only the part in front of the extension gets truncated. If the
    encoded extension on its own is longer than 'limit', it cannot be
    preserved and the whole filename is truncated instead.
    """
    name, extension = os.path.splitext(filename)
    bext = extension.encode(encoding)
    if len(bext) > limit:
        # a negative slice bound would cut characters from the *end* of
        # 'name' and the result would still be longer than 'limit'
        return shorten_path(filename, limit, encoding)
    bname = name.encode(encoding)[:limit-len(bext)]
    return bname.decode(encoding, "ignore") + extension


def extract(txt, begin, end, pos=0):
    """Extract the text between 'begin' and 'end' from 'txt'

    Args:
        txt: String to search in
        begin: First string to be searched for
        end: Second string to be searched for after 'begin'
        pos: Starting position for searches in 'txt'

    Returns:
        The string between the two search-strings 'begin' and 'end'
        beginning with position 'pos' in 'txt' as well as the position
        after 'end'.

        If at least one of 'begin' or 'end' is not found, None and the
        original value of 'pos' is returned

    Examples:
        extract("abcde", "b", "d")    -> "c" , 4
        extract("abcde", "b", "d", 3) -> None, 3
    """
    try:
        first = txt.index(begin, pos) + len(begin)
        last = txt.index(end, first)
        return txt[first:last], last+len(end)
    except ValueError:
        return None, pos


def extract_all(txt, rules, pos=0, values=None):
    """Calls extract for each rule and returns the result in a dict

    Args:
        txt: String to search in
        rules: Iterable of (key, begin, end) tuples; they are applied in
            order, each search starting where the previous one stopped
        pos: Starting position for the first search in 'txt'
        values: Optional dict to store the results in; a new dict is
            created if omitted

    Returns:
        The dict of results as well as the position after the last
        successful match. Results of rules with a false 'key' (e.g. "" or
        None) are not stored; rules without a match store None.
    """
    if values is None:
        values = {}
    for key, begin, end in rules:
        result, pos = extract(txt, begin, end, pos)
        if key:
            values[key] = result
    return values, pos


if platform.system() == "Windows":
    clean_path = clean_path_windows
else:
    clean_path = clean_path_posix

unquote = urllib.parse.unquote

try:
    unescape = html.unescape
except AttributeError:
    # interpreters without html.unescape() still provide the old method;
    # it has been removed from HTMLParser in Python 3.9
    unescape = html.parser.HTMLParser().unescape