Edgewall Software

source: trunk/trac/util/text.py

Last change on this file was 17657, checked in by Jun Omae, 8 months ago

1.5.4dev: update copyright year to 2023 (refs #13402)

[skip ci]

  • Property svn:eol-style set to native
File size: 27.1 KB
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2003-2023 Edgewall Software
4# Copyright (C) 2003-2004 Jonas Borgström <jonas@edgewall.com>
5# Copyright (C) 2006 Matthew Good <trac@matt-good.net>
6# Copyright (C) 2005-2006 Christian Boos <cboos@edgewall.org>
7# All rights reserved.
8#
9# This software is licensed as described in the file COPYING, which
10# you should have received as part of this distribution. The terms
11# are also available at https://trac.edgewall.org/wiki/TracLicense.
12#
13# This software consists of voluntary contributions made by many
14# individuals. For the exact contribution history, see the revision
15# history and logs, available at https://trac.edgewall.org/log/.
16#
17# Author: Jonas Borgström <jonas@edgewall.com>
18# Matthew Good <trac@matt-good.net>
19# Christian Boos <cboos@edgewall.org>
20
21import base64
22import configparser
23import locale
24import os
25import pkg_resources
26import re
27import sys
28import textwrap
29from urllib.parse import quote, quote_plus, unquote
30from unicodedata import east_asian_width
31
32import jinja2
33
# Canonical carriage-return/line-feed pair (network / DOS line ending).
CRLF = '\r\n'
class Empty(str):
    """A special tag object evaluating to the empty string"""
    __slots__ = []

# Singleton marker; e.g. `unicode_urlencode` emits a key without '=value'
# when the value *is* this object (identity check, not equality).
empty = Empty()

del Empty  # shouldn't be used outside of Trac core
44
# -- Jinja2

_jinja2_ver = pkg_resources.parse_version(jinja2.__version__)
# Extensions enabled on every environment created by `jinja2env`.
_jinja2_exts = ['jinja2.ext.do', 'jinja2.ext.i18n']
if _jinja2_ver < pkg_resources.parse_version('3'):
    # 'with' became built-in syntax in Jinja2 3.0; the extension only
    # exists (and is only needed) on older versions.
    _jinja2_exts.append('jinja2.ext.with_')
def jinja2env(**kwargs):
    """Creates a Jinja2 ``Environment`` configured with Trac conventions.

    All default parameters can optionally be overridden. The ``loader``
    parameter is not set by default, so unless it is set by the
    caller, only inline templates can be created from the environment.

    :rtype: `jinja.Environment`

    """
    autoescape_exts = ('.html', '.rss', '.xml')

    def none_to_empty(value):
        # Render missing (None) values as '' instead of the string 'None'.
        return '' if value is None else value

    def should_autoescape(template_name):
        # Escape only for markup-producing template extensions.
        return template_name and template_name.endswith(autoescape_exts)

    options = {
        'variable_start_string': '${',
        'variable_end_string': '}',
        'line_statement_prefix': '#',
        'line_comment_prefix': '##',
        'trim_blocks': True,
        'lstrip_blocks': True,
        'extensions': list(_jinja2_exts),
        'finalize': none_to_empty,
        'autoescape': should_autoescape,
    }
    options.update(kwargs)
    env = jinja2.Environment(**options)
    env.globals.update(len=len)
    return env
def jinja2template(template, text=False, **kwargs):
    """Creates a Jinja2 ``Template`` from inlined source.

    :param template: the template content
    :param text: if set to `False`, the result of the variable
                 expansion will be XML/HTML escaped
    :param kwargs: additional arguments to pass to `jinja2env`. See
                   `jinja2.Environment` for supported arguments.
    """
    env = jinja2env(autoescape=not text, **kwargs)
    return env.from_string(template)
96
97# -- Unicode
98
def to_unicode(text, charset=None):
    """Convert input to a `str` object.

    For a `bytes` object, we'll first try to decode the bytes using the given
    `charset` encoding (or UTF-8 if none is specified), then we fall back to
    the latin1 encoding which might be correct or not, but at least preserves
    the original byte sequence by mapping each byte to the corresponding
    unicode code point in the range U+0000 to U+00FF.

    For anything else, a simple `str()` conversion is attempted,
    with special care taken with `Exception` objects.
    """
    if isinstance(text, bytes):
        try:
            return str(text, charset or 'utf-8')
        except UnicodeDecodeError:
            # latin1 maps every byte to a code point, so this cannot fail
            return str(text, 'latin1')
    if isinstance(text, Exception):
        # two possibilities for storing unicode strings in exception data:
        try:
            # custom __str__ method on the exception (e.g. PermissionError)
            result = str(text)
        except UnicodeError:
            # unicode arguments given to the exception (e.g. parse_date)
            return ' '.join(to_unicode(arg) for arg in text.args)
        if os.name == 'nt':
            # remove duplicated backslashes from filename in the message
            if isinstance(text, EnvironmentError) and text.filename:
                source = repr(text.filename)
            elif isinstance(text, configparser.ParsingError) and text.source:
                source = repr(text.source)
            else:
                source = None
            if source:
                result = result.replace(source, source.replace(r'\\', '\\'))
        return result
    return str(text)
137
def exception_to_unicode(e, traceback=False):
    """Convert an `Exception` to a `str` object.

    In addition to `to_unicode`, this representation of the exception
    also contains the class name and optionally the traceback.
    """
    message = '%s: %s' % (type(e).__name__, to_unicode(e))
    if traceback:
        from trac.util import get_last_traceback
        # Drop the final "<Type>: <msg>" pair of lines; the message is
        # re-appended below in normalized form.
        tb_lines = get_last_traceback().split('\n')[:-2]
        message = '\n%s\n%s' % (to_unicode('\n'.join(tb_lines)), message)
    return message
151
def path_to_unicode(path):
    """Convert a filesystem path to str, using the filesystem encoding."""
    if not isinstance(path, bytes):
        return str(path)
    try:
        return str(path, sys.getfilesystemencoding())
    except UnicodeDecodeError:
        # latin1 decodes any byte sequence, preserving the raw bytes
        return str(path, 'latin1')
161
# White-space (including zero-width space U+200B) at the very start / end.
_ws_leading_re = re.compile('\\A[\\s\u200b]+', re.UNICODE)
_ws_trailing_re = re.compile('[\\s\u200b]+\\Z', re.UNICODE)

def stripws(text, leading=True, trailing=True):
    """Strips unicode white-spaces and ZWSPs from ``text``.

    :param leading: strips leading spaces from ``text`` unless ``leading`` is
                    `False`.
    :param trailing: strips trailing spaces from ``text`` unless ``trailing``
                     is `False`.
    """
    for enabled, pattern in ((leading, _ws_leading_re),
                             (trailing, _ws_trailing_re)):
        if enabled:
            text = pattern.sub('', text)
    return text
179
def strip_line_ws(text, leading=True, trailing=True):
    """Strips unicode white-spaces and ZWSPs from each line of ``text``.

    :param leading: strips leading spaces from ``text`` unless ``leading`` is
                    `False`.
    :param trailing: strips trailing spaces from ``text`` unless ``trailing``
                     is `False`.
    """
    # The capturing split keeps line separators at odd indices, so the
    # line bodies sit at the even indices and are stripped in place.
    parts = re.compile(r'(\n|\r\n|\r)').split(text)
    if leading:
        parts[::2] = (_ws_leading_re.sub('', part) for part in parts[::2])
    if trailing:
        parts[::2] = (_ws_trailing_re.sub('', part) for part in parts[::2])
    return ''.join(parts)
195
# Escape table for embedding text in Javascript string literals; control
# characters and HTML-significant characters become \uXXXX escapes.
_js_quote = {'\\': '\\\\', '"': '\\"', '\b': '\\b', '\f': '\\f',
             '\n': '\\n', '\r': '\\r', '\t': '\\t', "'": "\\'"}
for i in list(range(0x20)) + [ord(c) for c in '&<>\u2028\u2029']:
    _js_quote.setdefault(chr(i), '\\u%04x' % i)
_js_quote_re = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t\'&<>' + '\u2028\u2029]')
_js_string_re = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t&<>' + '\u2028\u2029]')


def javascript_quote(text):
    """Quote strings for inclusion in single or double quote delimited
    Javascript strings
    """
    if not text:
        return ''
    return _js_quote_re.sub(lambda m: _js_quote[m.group(0)], text)
214
def to_js_string(text):
    """Embed the given string in a double quote delimited Javascript string
    (conform to the JSON spec)
    """
    if not text:
        return '""'
    # Note: single quotes are *not* escaped here, unlike javascript_quote.
    escaped = _js_string_re.sub(lambda m: _js_quote[m.group(0)], text)
    return '"%s"' % escaped
225
def unicode_quote(value, safe='/'):
    """A unicode aware version of `urllib.quote`

    :param value: anything that converts to a `bytes`. If `str`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote`, the characters that would otherwise be
                 quoted but shouldn't here (defaults to '/')
    """
    if not isinstance(value, bytes):
        value = str(value)
    return quote(value, safe)
236
def unicode_quote_plus(value, safe=''):
    """A unicode aware version of `urllib.quote_plus`.

    :param value: anything that converts to a `bytes`. If `str`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote_plus`, the characters that would
                 otherwise be quoted but shouldn't here (defaults to
                 the empty string)
    """
    # Fix: the docstring used to claim the default for `safe` is '/',
    # but the actual default is '' (unlike `unicode_quote`).
    return quote_plus(value if isinstance(value, bytes) else str(value), safe)
248
def unicode_unquote(value):
    """A unicode aware version of `urllib.unquote`.

    :param value: UTF-8 encoded `str` value (for example, as obtained by
                  `unicode_quote`).
    :rtype: `str`
    """
    # Bytes input: map one byte per character (latin1) so the %xx escape
    # sequences pass through unchanged before being unquoted as UTF-8.
    text = value.decode('latin1') if isinstance(value, bytes) else value
    return unquote(text, encoding='utf-8', errors='strict')
260
def unicode_urlencode(params, safe=''):
    """A unicode aware version of `urllib.urlencode`.

    Values set to `empty` are converted to the key alone, without the
    equal sign.
    """
    if isinstance(params, dict):
        # Deterministic output: sort dict items by key.
        params = sorted(params.items(), key=lambda item: item[0])
    pairs = []
    for name, value in params:
        pair = unicode_quote_plus(name, safe)
        if value is not empty:
            pair += '=' + unicode_quote_plus(value, safe)
        pairs.append(pair)
    return '&'.join(pairs)
278
# All printable ASCII characters except the space ('!' through '~').
_qs_quote_safe = ''.join(map(chr, range(0x21, 0x7f)))

def quote_query_string(text):
    """Quote strings for query string
    """
    # Only spaces (as '+') and non-ASCII characters end up quoted.
    return unicode_quote_plus(text, _qs_quote_safe)
286
def to_utf8(text, charset='latin1'):
    """Convert input to a UTF-8 `bytes` object.

    If the input is not an `str` object, we assume the encoding is
    already UTF-8, ISO Latin-1, or as specified by the optional
    *charset* parameter.
    """
    if not isinstance(text, bytes):
        return to_unicode(text).encode('utf-8')
    try:
        str(text, 'utf-8')
    except UnicodeError:
        pass
    else:
        # Already valid UTF-8: return the bytes untouched.
        return text
    try:
        # Use the user supplied charset if possible
        decoded = str(text, charset)
    except UnicodeError:
        # latin1 accepts any byte sequence, so this always works
        decoded = str(text, 'latin1')
    return decoded.encode('utf-8')
311
class unicode_passwd(str):
    """Conceal the actual content of the string when `repr` is called."""

    def __repr__(self):
        # Never leak the password into logs or tracebacks.
        return '*******'
317
def stream_encoding(stream):
    """Return the appropriate encoding for the given stream."""
    encoding = getattr(stream, 'encoding', None)
    # Windows returns 'cp0' to indicate no encoding; fall back to UTF-8
    # in that case or when the stream declares no encoding at all.
    if encoding not in (None, 'cp0'):
        return encoding
    return 'utf-8'
324
def console_print(out, *args, **kwargs):
    """Output the given arguments to the console, encoding the output
    as appropriate.

    :param kwargs: ``newline`` controls whether a newline will be appended
                   (defaults to `True`)
    """
    out.write(' '.join(to_unicode(arg) for arg in args))
    newline = kwargs.get('newline', True)
    if newline:
        out.write('\n')
336
def printout(*args, **kwargs):
    """Do a `console_print` on `sys.stdout`.

    :param kwargs: forwarded to `console_print` (notably ``newline``)
    """
    console_print(sys.stdout, *args, **kwargs)
341
def printerr(*args, **kwargs):
    """Do a `console_print` on `sys.stderr`.

    :param kwargs: forwarded to `console_print` (notably ``newline``)
    """
    console_print(sys.stderr, *args, **kwargs)
346
def printfout(message, *args, **kwargs):
    """Format `message` with `args` (%-interpolation), do a
    `console_print` on `sys.stdout` and flush the buffer.
    """
    if args:
        message %= args
    printout(message, **kwargs)
    sys.stdout.flush()
356
def printferr(message, *args, **kwargs):
    """Format `message` with `args` (%-interpolation), do a
    `console_print` on `sys.stderr` and flush the buffer.
    """
    if args:
        message %= args
    printerr(message, **kwargs)
    sys.stderr.flush()
366
def raw_input(prompt):
    """Input one line from the console and converts it to unicode as
    appropriate.
    """
    printout(prompt, newline=False)
    # input() already returns str on Python 3, in which case the charset
    # argument is ignored by `to_unicode`.
    return to_unicode(input(), sys.stdin.encoding)
374
# Cached once at import time so that worker threads never have to call
# locale.getpreferredencoding() themselves (it is not thread-safe).
_preferredencoding = locale.getpreferredencoding()

def getpreferredencoding():
    """Return the encoding, which is retrieved on ahead, according to user
    preference.

    We should use this instead of `locale.getpreferredencoding()` which
    is not thread-safe."""
    return _preferredencoding
385
386# -- Plain text formatting
387
def text_width(text, ambiwidth=1):
    """Determine the column width of `text` in Unicode characters.

    The characters in the East Asian Fullwidth (F) or East Asian Wide (W)
    have a column width of 2. The other characters in the East Asian
    Halfwidth (H) or East Asian Narrow (Na) have a column width of 1.

    That `ambiwidth` parameter is used for the column width of the East
    Asian Ambiguous (A). If `1`, the same width as characters in US-ASCII.
    This is expected by most users. If `2`, twice the width of US-ASCII
    characters. This is expected by CJK users.

    cf. http://www.unicode.org/reports/tr11/.
    """
    # Width categories counted as double-width; 'A' (Ambiguous) counts
    # double only under CJK conventions (ambiwidth == 2).
    twice = 'FWA' if ambiwidth == 2 else 'FW'
    # Fixes two idiom issues of the previous version: the loop variable
    # shadowed the builtin `chr`, and `sum` materialized a throwaway list.
    return sum(2 if east_asian_width(c) in twice else 1
               for c in to_unicode(text))
406
407def _get_default_ambiwidth():
408 """Return width of East Asian Ambiguous based on locale environment
409 variables or Windows codepage.
410 """
411
412 if os.name == 'nt':
413 import ctypes
414 codepage = ctypes.windll.kernel32.GetConsoleOutputCP()
415 if codepage in (932, # Japanese (Shift-JIS)
416 936, # Chinese Simplified (GB2312)
417 949, # Korean (Unified Hangul Code)
418 950): # Chinese Traditional (Big5)
419 return 2
420 else:
421 for name in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
422 value = os.environ.get(name) or ''
423 if value:
424 if name == 'LANGUAGE' and ':' in value:
425 value = value.split(':')[0]
426 return 2 if value.lower().startswith(('zh', 'ja', 'ko')) else 1
427
428 return 1
429
430
431_default_ambiwidth = _get_default_ambiwidth()
432
433
def print_table(data, headers=None, sep=' ', out=None, ambiwidth=None):
    """Print data according to a tabular layout.

    :param data: a sequence of rows; assume all rows are of equal length.
    :param headers: an optional row containing column headers; must be of
                    the same length as each row in `data`.
    :param sep: column separator
    :param out: output file descriptor (`None` means use `sys.stdout`)
    :param ambiwidth: column width of the East Asian Ambiguous (A). If None,
                      detect ambiwidth with the locale settings. If others,
                      pass to the `ambiwidth` parameter of `text_width`.
    """
    if out is None:
        out = sys.stdout
    if ambiwidth is None:
        ambiwidth = _default_ambiwidth
    data = list(data)
    if headers:
        # Header is laid out like a data row, then underlined below.
        data.insert(0, headers)
    elif not data:
        return

    # Convert to a str object with `to_unicode`. If None, convert to a
    # empty string.
    def to_text(val):
        if val is None:
            return ''
        return to_unicode(val)

    def tw(text):
        # Shorthand: display width honoring the ambiwidth setting.
        return text_width(text, ambiwidth=ambiwidth)

    def to_lines(data):
        # Expand rows containing embedded newlines into several physical
        # rows, padding shorter cells with empty strings.
        lines = []
        for row in data:
            row = [to_text(cell) for cell in row]
            if any('\n' in cell for cell in row):
                row = [cell.splitlines() for cell in row]
                max_lines = max(len(cell) for cell in row)
                for cell in row:
                    if len(cell) < max_lines:
                        cell += [''] * (max_lines - len(cell))
                lines.extend([cell[idx] for cell in row]
                             for idx in range(max_lines))
            else:
                lines.append(row)
        return lines

    data = to_lines(data)

    # Each column is as wide as its widest cell (display width).
    num_cols = len(data[0])
    col_width = [max(tw(row[idx]) for row in data)
                 for idx in range(num_cols)]

    out.write('\n')
    for ridx, row in enumerate(data):
        for cidx, cell in enumerate(row):
            if cidx + 1 == num_cols:
                line = cell  # No separator after last column
            else:
                if headers and ridx == 0:
                    sp = ' ' * tw(sep)  # No separator in header
                else:
                    sp = sep
                # '%-*s' pads by character count, so the pad width is
                # adjusted by the difference between display width and
                # character count of the cell.
                line = '%-*s%s' % (col_width[cidx] - tw(cell) + len(cell),
                                   cell, sp)
            out.write(line)

        out.write('\n')
        if ridx == 0 and headers:
            # Underline the header row across the whole table width.
            out.write('-' * (tw(sep) * cidx + sum(col_width)))
            out.write('\n')
    out.write('\n')
508
def shorten_line(text, maxlen=75):
    """Truncates `text` to length less than or equal to `maxlen` characters.

    This tries to be (a bit) clever and attempts to find a proper word
    boundary for doing so.
    """
    if not text or len(text) <= maxlen:
        return text
    suffix = ' ...'
    limit = maxlen - len(suffix)
    # Prefer cutting at the last space or newline before the limit.
    cut_at = max(text.rfind(' ', 0, limit), text.rfind('\n', 0, limit))
    if cut_at < 0:
        cut_at = limit
    return text[:cut_at] + suffix
524
class UnicodeTextWrapper(textwrap.TextWrapper):
    """`textwrap.TextWrapper` subclass measuring display width with
    `text_width` and allowing line breaks inside runs of CJK text
    (which contain no spaces to break at).
    """

    # Unicode ranges inside which a break may occur between any two
    # characters. The 3rd element of the Plane 2/3 entries is a legacy
    # surrogate-pair pattern; `_init_patterns` only uses elements 0 and 1.
    breakable_char_ranges = [
        (0x1100, 0x11FF),   # Hangul Jamo
        (0x2E80, 0x2EFF),   # CJK Radicals Supplement
        (0x3000, 0x303F),   # CJK Symbols and Punctuation
        (0x3040, 0x309F),   # Hiragana
        (0x30A0, 0x30FF),   # Katakana
        (0x3130, 0x318F),   # Hangul Compatibility Jamo
        (0x3190, 0x319F),   # Kanbun
        (0x31C0, 0x31EF),   # CJK Strokes
        (0x3200, 0x32FF),   # Enclosed CJK Letters and Months
        (0x3300, 0x33FF),   # CJK Compatibility
        (0x3400, 0x4DBF),   # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),   # CJK Unified Ideographs
        (0xA960, 0xA97F),   # Hangul Jamo Extended-A
        (0xAC00, 0xD7AF),   # Hangul Syllables
        (0xD7B0, 0xD7FF),   # Hangul Jamo Extended-B
        (0xF900, 0xFAFF),   # CJK Compatibility Ideographs
        (0xFE30, 0xFE4F),   # CJK Compatibility Forms
        (0xFF00, 0xFFEF),   # Halfwidth and Fullwidth Forms
        (0x20000, 0x2FFFF, '[\uD840-\uD87F][\uDC00-\uDFFF]'),  # Plane 2
        (0x30000, 0x3FFFF, '[\uD880-\uD8BF][\uDC00-\uDFFF]'),  # Plane 3
    ]

    # Compiled lazily by `_init_patterns` on first instantiation, then
    # shared by all instances through the class.
    split_re = None
    breakable_re = None

    @classmethod
    def _init_patterns(cls):
        # Build a single character class covering all breakable ranges.
        char_ranges = []
        for val in cls.breakable_char_ranges:
            high = chr(val[0])
            low = chr(val[1])
            char_ranges.append('%s-%s' % (high, low))
        char_ranges = ''.join(char_ranges)
        pattern = '[%s]+' % char_ranges

        cls.split_re = re.compile(
            r'(\s+|' +                                 # any whitespace
            pattern + '|' +                            # breakable text
            r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' +  # hyphenated words
            r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))',    # em-dash
            re.UNICODE)
        cls.breakable_re = re.compile(r'\A' + pattern, re.UNICODE)

    def __init__(self, cols, replace_whitespace=0, break_long_words=0,
                 initial_indent='', subsequent_indent='', ambiwidth=1):
        # Note: replace_whitespace / break_long_words are accepted for
        # signature compatibility but always passed to the base class as 0.
        textwrap.TextWrapper.__init__(
            self, cols, replace_whitespace=0, break_long_words=0,
            initial_indent=initial_indent,
            subsequent_indent=subsequent_indent)
        self.ambiwidth = ambiwidth
        if self.split_re is None:
            self._init_patterns()

    def _split(self, text):
        # Override: split on whitespace, breakable CJK runs, hyphenated
        # words and em-dashes, dropping the empty chunks.
        chunks = self.split_re.split(to_unicode(text))
        return list(filter(None, chunks))

    def _text_width(self, text):
        # Display width (CJK characters count as 2 columns).
        return text_width(text, ambiwidth=self.ambiwidth)

    def _wrap_chunks(self, chunks):
        # Override of TextWrapper._wrap_chunks measuring display width
        # instead of character count, and splitting breakable (CJK)
        # chunks mid-run when they don't fit on the current line.
        lines = []
        chunks.reverse()  # consumed right-to-left via pop()
        text_width = self._text_width

        while chunks:
            cur_line = []
            cur_width = 0

            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            width = self.width - text_width(indent)

            # Drop leading whitespace on continuation lines.
            if chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk = chunks[-1]
                w = text_width(chunk)
                if cur_width + w <= width:
                    # Chunk fits entirely on the current line.
                    cur_line.append(chunks.pop())
                    cur_width += w
                elif self.breakable_re.match(chunk):
                    # CJK run: take as many leading characters as fit,
                    # leave the remainder as the next pending chunk.
                    left_space = width - cur_width
                    for i in range(len(chunk)):
                        w = text_width(chunk[i])
                        if left_space < w:
                            break
                        left_space -= w
                    if i > 0:
                        cur_line.append(chunk[:i])
                        chunk = chunk[i:]
                        chunks[-1] = chunk
                        w = text_width(chunk)
                    break
                else:
                    break

            # `w` is the width of the first chunk that did not fit.
            if chunks and w > width:
                self._handle_long_word(chunks, cur_line, cur_width, width)

            # Drop trailing whitespace on the line.
            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines
638
def wrap(t, cols=75, initial_indent='', subsequent_indent='',
         linesep=os.linesep, ambiwidth=1):
    """Wraps the single paragraph in `t`, which contains unicode characters.
    The every line is at most `cols` characters long.

    That `ambiwidth` parameter is used for the column width of the East
    Asian Ambiguous (A). If `1`, the same width as characters in US-ASCII.
    This is expected by most users. If `2`, twice the width of US-ASCII
    characters. This is expected by CJK users.
    """
    # Normalize all line endings to '\n' before wrapping.
    text = t.strip().replace('\r\n', '\n').replace('\r', '\n')
    wrapper = UnicodeTextWrapper(cols, replace_whitespace=0,
                                 break_long_words=0,
                                 initial_indent=initial_indent,
                                 subsequent_indent=subsequent_indent,
                                 ambiwidth=ambiwidth)
    wrapped = []
    for line in text.split('\n'):
        # Preserve blank lines, which the wrapper would otherwise drop.
        wrapped.extend(wrapper.wrap(line.rstrip()) or [''])
    return linesep.join(wrapped)
660
# Marker replacing the domain part of obfuscated addresses: '@' + ellipsis.
_obfuscation_char = '@\u2026'

def obfuscate_email_address(address):
    """Replace anything looking like an e-mail address (``'@something'``)
    with a trailing ellipsis (``'@…'``)
    """
    if not address:
        return address
    at = address.find('@')
    if at == -1:
        return address
    # Keep a closing '>' so "Name <user@host>" stays well-formed.
    closing = '>' if address.endswith('>') else ''
    return address[:at] + _obfuscation_char + closing
673
674
def is_obfuscated(word):
    """Returns `True` if the `word` looks like an obfuscated e-mail
    address.

    :since: 1.2
    """
    # i.e. contains the '@…' marker inserted by `obfuscate_email_address`
    return _obfuscation_char in word
683
def breakable_path(path):
    """Make a path breakable after path separators, and conversely, avoid
    breaking at spaces.
    """
    if not path:
        return path
    prefix = ''
    if path.startswith('/'):
        # Avoid breaking right after the leading '/'.
        prefix, path = '/', path[1:]
    # A ZWSP after each separator marks break opportunities; NBSP glues
    # the words of a single path component together.
    broken = path.replace('/', '/\u200b').replace('\\', '\\\u200b')
    return prefix + broken.replace(' ', '\u00a0')
697
def normalize_whitespace(text, to_space='\u00a0', remove='\u200b'):
    """Normalize whitespace in a string, by replacing special spaces by normal
    spaces and removing zero-width spaces."""
    if not text:
        return text
    # Replacements first, removals second (order matters if a character
    # appears in both arguments).
    for ch in to_space:
        text = text.replace(ch, ' ')
    for ch in remove:
        text = text.replace(ch, '')
    return text
709
def unquote_label(txt):
    """Remove (one level of) enclosing single or double quotes.

    .. versionadded :: 1.0
    """
    if txt and txt[0] in "'\"" and txt[-1] == txt[0]:
        return txt[1:-1]
    return txt
717
def cleandoc(message):
    """Removes uniform indentation and leading/trailing whitespace."""
    # Aliased import avoids shadowing confusion with this function's name.
    from inspect import cleandoc as _cleandoc
    return _cleandoc(message).strip()
723
724# -- Conversion
725
def pretty_size(size, format='%.1f'):
    """Pretty print content size information with appropriate unit.

    :param size: number of bytes
    :param format: can be used to adjust the precision shown
    """
    if size is None:
        return ''

    if size < 1024:
        from trac.util.translation import ngettext
        return ngettext("%(num)d byte", "%(num)d bytes", num=size)

    # Repeatedly scale down by 1024; stop at the first unit where the
    # value drops below 1024 (or at TB, whichever comes first).
    value = float(size)
    unit = None
    for unit in ('KB', 'MB', 'GB', 'TB'):
        value /= 1024.0
        if value < 1024:
            break
    return (format + ' %s') % (value, unit)
748
def expandtabs(s, tabstop=8, ignoring=None):
    """Expand tab characters `'\\\\t'` into spaces.

    :param tabstop: number of space characters per tab
                    (defaults to the canonical 8)

    :param ignoring: if not `None`, the expansion will be "smart" and
                     go from one tabstop to the next. In addition,
                     this parameter lists characters which can be
                     ignored when computing the indent.
    """
    if '\t' not in s:
        return s
    if ignoring is None:
        return s.expandtabs(tabstop)

    expanded = []
    for line in s.split('\n'):
        if '\t' not in line:
            expanded.append(line)
            continue
        col = 0
        chunks = []
        for ch in line:
            if ch == '\t':
                pad = tabstop - col % tabstop
                chunks.append(' ' * pad)
                col += pad
            else:
                chunks.append(ch)
                # Characters listed in `ignoring` don't advance the column.
                if not ignoring or ch not in ignoring:
                    col += 1
        expanded.append(''.join(chunks))
    return '\n'.join(expanded)
785
def fix_eol(text, eol):
    """Fix end-of-lines in a text."""
    lines = text.splitlines()
    if isinstance(text, bytes):
        # Operate entirely in bytes, line terminator included.
        lines.append(b'')
        eol = eol.encode('utf-8')
    else:
        lines.append('')
    # The appended empty element forces a trailing `eol` on the result.
    return eol.join(lines)
796
def unicode_to_base64(text, strip_newlines=True):
    """Safe conversion of ``text`` to base64 representation using
    utf-8 bytes.

    Strips newlines from output unless ``strip_newlines`` is `False`.
    """
    raw = to_unicode(text).encode('utf-8')
    # b64encode produces a single line; encodebytes inserts newlines
    # every 76 characters (MIME style).
    encoder = base64.b64encode if strip_newlines else base64.encodebytes
    return str(encoder(raw), 'ascii')
810
def unicode_from_base64(text):
    """Safe conversion of ``text`` to str based on utf-8 bytes."""
    return base64.b64decode(text).decode('utf-8')
814
815
def levenshtein_distance(lhs, rhs):
    """Return the Levenshtein distance between two strings."""
    # Keep the shorter string in `lhs` so the row buffer stays small.
    if len(lhs) > len(rhs):
        lhs, rhs = rhs, lhs
    if not lhs:
        return len(rhs)

    # Classic two-row dynamic programming; a substitution costs 2
    # (equivalent to one deletion plus one insertion), matches are free.
    previous_row = list(range(len(rhs) + 1))
    for i, lch in enumerate(lhs):
        current_row = [i + 1]
        for j, rch in enumerate(rhs):
            deletion = previous_row[j + 1] + 1
            insertion = current_row[j] + 1
            substitution = previous_row[j] + (0 if lch == rch else 2)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row
    return previous_row[-1]
833
834
# Matches $XYZ-style variables: '$' followed by an upper-case identifier.
sub_vars_re = re.compile("[$]([A-Z_][A-Z0-9_]*)")

def sub_vars(text, args):
    """Substitute $XYZ-style variables in a string with provided values.

    :param text: string containing variables to substitute.
    :param args: dictionary with keys matching the variables to be substituted.
                 The keys should not be prefixed with the $ character."""
    def substitute(match):
        name = match.group(1)
        # Unknown variables are left untouched, '$' prefix included.
        return args.get(name, '$' + name)
    return sub_vars_re.sub(substitute, text)
Note: See TracBrowser for help on using the repository browser.