source: main/waeup.kofa/trunk/src/waeup/kofa/utils/helpers.py @ 16545

Last change on this file since 16545 was 15748, checked in by uli, 5 years ago

Bad workaround to cope with overcrowded catalogs.

Zope does not cope well with catalogs of more than 500.000 elements.
Reindexing the catalog then means to put all objects into memory before
saving them - this can blow everything.

We therefore use a dirty hack to replace the updateIndex method of
a catalog with a more careful function, that, however, should not be
run in production mode. No new items should be added during the run.

In the long run, we certainly need something more sustainable.

  • Property svn:keywords set to Id
File size: 29.8 KB
RevLine 
[7196]1## $Id: helpers.py 15748 2019-11-04 10:15:44Z uli $
2##
3## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
[7819]18"""General helper functions for Kofa.
[4188]19"""
[10677]20import unicodecsv as csv  # XXX: csv ops should move to dedicated module.
[8185]21import datetime
[8466]22import imghdr
[9593]23import logging
[4188]24import os
[8185]25import pytz
[4375]26import re
[4188]27import shutil
[8631]28import tempfile
[15748]29import transaction
[5731]30import grok
[5848]31from cStringIO import StringIO
32from docutils.core import publish_string
[15595]33from HTMLParser import HTMLParser
[7943]34from zope.component import getUtility
[5731]35from zope.component.interfaces import IFactory
[5734]36from zope.interface import implementedBy
[7941]37from zope.interface.interface import Method, Attribute
[15739]38from zope.intid.interfaces import IIntIds
[6071]39from zope.schema import getFieldNames
40from zope.schema.fieldproperty import FieldProperty
[6372]41from zope.security.interfaces import NoInteraction
42from zope.security.management import getInteraction
[7175]43from zope.pluggableauth.interfaces import IAuthenticatorPlugin
[12231]44from zope.formlib.widget import renderElement
[4188]45
[15739]46
[6503]47BUFSIZE = 8 * 1024
[6372]48
[10677]49
def remove_file_or_directory(filepath):
    """Delete `filepath`, be it a regular file or a directory tree.

    In contrast to :func:`shutil.rmtree` a path that does not exist
    is silently ignored and a path that turns out to be a regular
    file is removed as well.
    """
    target = os.path.abspath(filepath)
    if os.path.isdir(target):
        # Directories are removed including all their contents.
        shutil.rmtree(target)
    elif os.path.exists(target):
        os.unlink(target)
65
[10677]66
def copy_filesystem_tree(src, dst, overwrite=False, del_old=False):
    """Copy the contents of directory `src` into directory `dst`.

    Both directories must exist, otherwise a `ValueError` is raised.

    Items that already exist in `dst` are replaced only if `overwrite`
    is ``True``. Otherwise they are left untouched and reported in the
    result.

    If `del_old` is true, each copied item is removed from `src`
    after it was copied.

    Unix hidden files and directories (names starting with ``'.'``)
    are never processed.

    Returns a list with the names of all items that were not copied.
    """
    if not os.path.exists(src):
        raise ValueError('source path does not exist: %s' % src)
    if not os.path.exists(dst):
        raise ValueError('destination path does not exist: %s' % dst)
    if not os.path.isdir(src):
        raise ValueError('source path is not a directory: %s' % src)
    if not os.path.isdir(dst):
        raise ValueError('destination path is not a directory: %s' % dst)

    skipped = []
    # Hidden stuff is not copied at all.
    visible = [name for name in os.listdir(src) if not name.startswith('.')]
    for name in visible:
        source_item = os.path.join(src, name)
        target_item = os.path.join(dst, name)

        if os.path.exists(target_item):
            if overwrite is not True:
                skipped.append(name)
                continue
            remove_file_or_directory(target_item)

        if os.path.isdir(source_item):
            copy = shutil.copytree
        else:
            copy = shutil.copy2
        copy(source_item, target_item)

        if del_old:
            remove_file_or_directory(source_item)
    return skipped
[4375]112
113
def get_inner_HTML_part(html_code):
    """Extract the 'inner' part of a complete HTML snippet.

    The first rule that applies wins:

    1. If there is a form part, the form (including its tags) is
       returned.

    2. If there is a body part, the contents of the body are returned.

    3. Otherwise the input is returned as-is.

    A document with a form gives that form only:

       >>> doc = '<html><form>My Form</form>Outside the form</html>'
       >>> get_inner_HTML_part(doc)
       '<form>My Form</form>'

    Without a form we look for a body part and get its contents:

       >>> doc = '<html><body>My Body</body>Trailing Trash</html>'
       >>> get_inner_HTML_part(doc)
       'My Body'

    If none of these is included, we get back what we passed in:

       >>> doc = '<html>without body nor form</html>'
       >>> get_inner_HTML_part(doc)
       '<html>without body nor form</html>'

    """
    form_match = re.match(
        r'^.+(<form[^\>]*>.*</form>).+$', html_code, re.DOTALL)
    if form_match is not None:
        return form_match.groups()[0]
    # No <form> part included, look for a body instead.
    body_match = re.match(
        r'^.+<body[^\>]*>(.*)</body>.*$', html_code, re.DOTALL)
    if body_match is not None:
        return body_match.groups()[0]
    # Neither <form> nor <body> tag found.
    return html_code
158
[10677]159
class FactoryBase(grok.GlobalUtility):
    """A factory for things.

    This is a baseclass for easier creation of factories. Factories
    are utilities that are registered under a certain name and return
    instances of certain classes when called.

    In :mod:`waeup.kofa` we use factories extensively for
    batching. While processing a batch some processors look up a
    factory to create real-world instances that then get filled with
    data from imported CSV files.

    To get rid of reimplementing the same stuff over and over again,
    most notably the methods defined here, we offer this base class
    (which will *not* be registered as a factory itself).

    Real factories can then be created like this:

       >>> import grok
       >>> from waeup.kofa.utils.helpers import FactoryBase
       >>> class MyObject(object):
       ...   # Some class we want to get instances of.
       ...   pass
       >>> class MyObjectFactory(FactoryBase):
       ...   # This is the factory for MyObject instances
       ...   grok.name(u'waeup.kofa.factory.MyObject')
       ...   factory = MyObject

    That's it. It is essential to set the ``factory`` attribute, which
    will determine the class of which instances should be created when
    called. The given name must even be unique amongst all utilities
    registered during runtime. While you can pick any name you like
    you might want to prepend ``waeup.kofa.factory.`` to the name
    string to make sure it does not clash with names of other
    utilities one day.

    Before all this works we have to grok the baseclass once and our
    freshly defined factory. This executes all the component
    registration stuff we don't want to do ourselves. In daily use
    this is done automatically on startup of a :mod:`waeup.kofa`
    system.

       >>> grok.testing.grok('waeup.kofa.utils.helpers')
       >>> grok.testing.grok_component(
       ...    'MyObjectFactory', MyObjectFactory
       ...  )
       True

    After grokking we (and processors) can create objects without
    knowing about the location of the real class definition, just by
    the factory name:

       >>> from zope.component import createObject
       >>> obj = createObject('waeup.kofa.factory.MyObject')
       >>> isinstance(obj, MyObject)
       True

    We can also use the regular utility lookups to find our new
    factory:

       >>> from zope.component import getUtility
       >>> from zope.component.interfaces import IFactory
       >>> factory = getUtility(
       ...   IFactory, name='waeup.kofa.factory.MyObject'
       ...   )
       >>> isinstance(factory, MyObjectFactory)
       True

    And this factory generates `MyObject` instances:

       >>> obj = factory()
       >>> isinstance(obj, MyObject)
       True

    """
    grok.baseclass()  # Do not grok this class, do not register us.
    grok.implements(IFactory)
    # You can override any of the following attributes in derived
    # classes. The `grok.name` setting *must* even be set to some
    # unique value.
    grok.name(u'waeup.Factory')
    # A plain string. A trailing comma here would silently turn the
    # title into a 1-tuple.
    title = u"Create instances of ``factory``."
    description = u"This factory instantiates new applicant instances."
    factory = None

    def __call__(self, *args, **kw):
        """The main factory function.

        Returns a new instance of ``self.factory``. Any arguments
        passed in are accepted but not handed over to the created
        object.
        """
        return self.factory()

    def getInterfaces(self):
        """Return the interfaces implemented by ``self.factory``.

        Required by `IFactory`.
        """
        return implementedBy(self.factory)
[5848]255
[10677]256
def ReST2HTML_w_warnings(source_string):
    """Render reStructuredText as HTML and collect any warnings.

    Returns a tuple ``(<HTML_CODE>, <WARNINGS>)``. ``<HTML_CODE>`` is
    the HTML code generated from the source string (in unicode),
    ``<WARNINGS>`` is a string containing any warning messages or
    ``None`` if there were none.

    Regular multi-line ReStructuredText strings will be returned as
    HTML code:

        >>> from waeup.kofa.utils.helpers import ReST2HTML_w_warnings
        >>> source = '''
        ... Headline
        ... ========
        ...
        ... - A list item
        ... - Another item
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <ul class="simple">
        <li>A list item</li>
        <li>Another item</li>
        </ul>
        <p>Thanks for watching!</p>
        </div>

    Here no warnings happened, so the `warnings` are ``None``:

        >>> warnings is None
        True

    If warnings happen then they can be retrieved in the returned
    ``warnings``. We try to render an erroneous document:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

        >>> print warnings
        <string>:3: (WARNING/2) Title underline too short.
        <BLANKLINE>
        Headline
        ======
        <BLANKLINE>

    As you can see, the warnings are not displayed inline in the
    document but can be retrieved from the returned warnings, which is
    a string or ``None``.
    """
    warning_stream = StringIO()
    # First pass: report everything into our own stream.
    fulldoc = publish_string(
        source_string,
        writer_name='html4css1',
        settings_overrides={
            'report_level': 0,
            'warning_stream': warning_stream,
            },
        )
    warning_stream.seek(0)
    warning_msgs = warning_stream.read()
    if warning_msgs:
        # Second pass: render again, this time with no warnings inline.
        fulldoc = publish_string(
            source_string,
            writer_name='html4css1',
            settings_overrides={
                'report_level': 10000,
                'halt_level': 10000,
                'warning_stream': warning_stream,
                },
            )
    else:
        warning_msgs = None
    html = get_inner_HTML_part(fulldoc).strip()
    if not isinstance(html, unicode):
        html = html.decode('utf-8')
    return html, warning_msgs
[5848]347
[10677]348
def ReST2HTML(source_string):
    """Convert a ReStructuredText string to HTML.

    Warnings about too short headings and the like are silently
    dropped. If you need them, use :func:`ReST2HTML_w_warnings`
    instead.

    The returned string will be unicode.

    A regular document will be rendered like this:

        >>> source = '''
        ... Headline
        ... ========
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    A document with markup problems (here: the underline is too short)
    will look similar:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    """
    rendered, _unused_warnings = ReST2HTML_w_warnings(source_string)
    return rendered
[6071]394
[10677]395
def attrs_to_fields(cls, omit=[]):
    """Turn the schema fields of a class into FieldProperty attributes.

    The fields are taken from the first interface implemented by
    `cls`. For each of them a `FieldProperty` bound to the field
    definition is set as class attribute of the same name.

    `omit` is a list of field names that should _not_ be turned into
    field properties. This is useful for properties and the like.

    Returns `cls`, so with Python >= 2.6 this function can also be
    used as a class decorator.
    """
    schema = list(implementedBy(cls))[0]
    wanted = (name for name in getFieldNames(schema) if name not in omit)
    for name in wanted:
        field = schema[name]
        prop = FieldProperty(field)
        # Set proper docstring for the API docs.
        prop.__doc__ = field.title + ' (computed attribute)'
        setattr(cls, name, prop)
    return cls
[6372]415
[10677]416
def get_current_principal():
    """Return the principal of the current interaction.

    This function needs no request. Examining a request is the
    regular (and recommended) way to get the principal involved
    'currently', so use this function only if you really have no
    access to the current request.

    Returns ``None`` when no principal is involved (for instance
    during tests), i.e. if there is no interaction or the interaction
    has no participations.
    """
    try:
        participations = getInteraction().participations
        return participations[0].principal
    except (NoInteraction, IndexError):
        # Either no interaction at all or no participations present.
        return None
[6503]437
[10677]438
def cmp_files(file_descr1, file_descr2):
    """Tell whether two open files have the same contents.

    Both files are rewound and then read chunk-wise (`BUFSIZE` bytes
    at a time).

    Returns ``True`` if both are equal, ``False`` otherwise.
    """
    for descr in (file_descr1, file_descr2):
        descr.seek(0)
    while True:
        chunk1 = file_descr1.read(BUFSIZE)
        chunk2 = file_descr2.read(BUFSIZE)
        if chunk1 != chunk2:
            return False
        if not chunk1:
            # Both files exhausted without any difference.
            return True
[7078]453
[10677]454
def string_from_bytes(number):
    """Turn a number of bytes into some textual representation.

    `number` can be an integer or a float. Kilobytes are given as
    whole numbers (rounded down), megabytes and gigabytes with two
    decimal places.

      Examples:

        >>> string_from_bytes(1)
        u'1 byte(s)'

        >>> string_from_bytes(1025)
        u'1 KB'

        >>> string_from_bytes(1.5 * 1024*1024)
        u'1.50 MB'

        >>> string_from_bytes(1572864)
        u'1.50 MB'

        >>> string_from_bytes(673.286 * 1024**3)
        u'673.29 GB'

    """
    if number < 1024:
        return u'%s byte(s)' % (str(number),)
    elif number < 1024 ** 2:
        # Explicit floor division: same result for ints and floats and
        # independent from the division mode of the interpreter.
        return u'%d KB' % (number // 1024,)
    # Force float division, so that integer input is not truncated.
    value = float(number)
    if number < 1024 ** 3:
        return u'%.2f MB' % (value / 1024 ** 2,)
    return u'%.2f GB' % (value / 1024 ** 3,)
[7079]480
[10677]481
def file_size(file_like_obj):
    """Determine file size in most effective manner.

    Returns the number of bytes in a file. This function works for
    both, real files as well as file-like objects like cStringIO based
    'files'.

    Real files are asked via `os.fstat`. File-like objects that have
    no usable file descriptor (including those that provide a
    `fileno` method which only raises an error, like `io.BytesIO`)
    are measured by seeking to their end.

    Example:

      >>> from cStringIO import StringIO
      >>> file_size(StringIO('my file content'))
      15

    Please note that this function expects the file-like object passed
    in to be at first reading position (it does no seek(0)) and that
    when finished the file pointer might be at end of file.
    """
    fileno = getattr(file_like_obj, 'fileno', None)
    if fileno is not None:
        try:
            return os.fstat(fileno()).st_size
        except (AttributeError, IOError, OSError, ValueError):
            # A `fileno` attribute is no guarantee for a real file
            # descriptor. Fall back to seeking.
            pass
    file_like_obj.seek(0, 2)  # seek to last position in file
    return file_like_obj.tell()
[7175]503
[10677]504
def get_user_account(request):
    """Return the local user account of the principal of `request`.

    The account is looked up by the principal id in the authenticator
    plugin registered under the name ``users``.
    """
    principal_id = request.principal.id
    users = getUtility(IAuthenticatorPlugin, name='users')
    return users.getAccount(principal_id)
[7941]512
[10677]513
def iface_names(iface, omit=[], exclude_attribs=True, exclude_methods=True):
    """Collect the attribute names of an interface and all its bases.

    By default names of fields that are pure attributes
    (i.e. `zope.interface.Attribute`) and names of methods are left
    out. Names of typical fields derived from `zope.schema` are
    included.

    The `omit` parameter can give a list of names to exclude.

    Returns an unsorted list of strings without duplicates.
    """
    known = set((iface,))
    # Collect all interfaces (also bases) until nothing new shows up.
    while True:
        grown = set(known)
        for candidate in grown:
            # `grown` is rebound here, the loop still runs over the
            # set it started with.
            grown = set.union(grown, set(candidate.getBases()))
        if grown == known:
            # No new interfaces found, list complete.
            break
        known = grown
    # Collect (filtered) names of collected interfaces.
    names = []
    for candidate in known:
        for name, descr in candidate.namesAndDescriptions():
            unwanted = (
                name in omit
                or (exclude_attribs and descr.__class__ is Attribute)
                or (exclude_methods and isinstance(descr, Method))
                or name in names
                )
            if not unwanted:
                names.append(name)
    return names
[7968]553
[10677]554
def get_sorted_preferred(tuples_iterable, preferred_list):
    """Get a list of tuples (<TITLE>,<TOKEN>) with values in
    `preferred_list` put in front.

    The rest of the tuples iterable is returned in original order. This
    is useful for putting default entries on top of (already sorted)
    lists of choice values, for instance when sorting countries and
    their code.

    Sample:

    We have a list of tuples with uppercase 'titles' and lowercase
    'tokens'. This list is already sorted but we want certain values
    of this list to show up before other values. For instance we want
    to see the 'C' entry to come first.

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['c'])
      (('C', 'c'), ('A', 'a'), ('B', 'b'))

    i.e. the entry with 'c' as second value moved to head of result.

    We can also require multiple entries at head of list:

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['b', 'c'])
      (('B', 'b'), ('C', 'c'), ('A', 'a'))

    We required the 'b' entry to come before the 'c' entry and then
    the rest of the input list. That's what we got.

    Preferred tokens that do not occur in the input are simply
    ignored:

      >>> get_sorted_preferred([('A','a'), ('B','b')], ['z', 'b'])
      (('B', 'b'), ('A', 'a'))

    The result is returned as a tuple of tuples to keep order of values.
    """
    preferred = list(preferred_list)
    # One slot per preferred token, filled when the token shows up.
    front = [None] * len(preferred)
    rest = []
    for title, code in tuples_iterable:
        if code in preferred:
            front[preferred.index(code)] = (title, code)
        else:
            rest.append((title, code))
    # Slots of preferred tokens not found in the input stay empty and
    # must not leak into the result.
    found = [entry for entry in front if entry is not None]
    return tuple(found + rest)
[8185]596
[10677]597
def now(tz=None):
    """Return the current datetime shifted into timezone `tz`.

    `tz` should be a timezone as defined in pytz. If it is ``None``,
    UTC time is returned.
    """
    utc_now = datetime.datetime.utcnow()
    return to_timezone(utc_now, tz=tz)
606
[10677]607
def to_timezone(dt, tz=None):
    """Move datetime `dt` into timezone `tz`.

    A 'naive' `dt` (one that carries no `tzinfo`) is taken as UTC.

    Without `tz` the value is shifted to UTC.

    Anything that is not a `datetime.datetime` is returned unchanged.
    """
    if not isinstance(dt, datetime.datetime):
        return dt
    target = pytz.utc if tz is None else tz
    aware = pytz.utc.localize(dt) if dt.tzinfo is None else dt
    shifted = aware.tzinfo.normalize(aware).astimezone(target)
    return target.normalize(shifted)
[8466]626
[10677]627
def imghdr_test_fpm(h, f):
    """Test for the FPM fileformat.

    The `fpm` fileformat is the binary fingerprint data as created by
    `libfprint`. Such data starts with the magic word ``FP1``.

    `h` is the header data to examine, `f` is not used. Returns
    ``'fpm'`` for fingerprint data and ``None`` otherwise.
    """
    is_fpm = len(h) >= 3 and h[:3] == 'FP1'
    return 'fpm' if is_fpm else None
636
637
#: Add test function in stdlib's imghdr tests (unless it is registered
#: already, same as we do for `check_pdf`).
if imghdr_test_fpm not in imghdr.tests:
    imghdr.tests.append(imghdr_test_fpm)
640
641
def get_fileformat(path, bytestream=None):
    """Try to determine the file format of a given media file.

    The checks done here are not very thorough, but they make no
    assumptions about the filetype based on the filename extension or
    similar. Instead header data is checked for compliance with
    commonly known rules (Magic Words).

    If `bytestream` is not ``None`` the `path` is ignored.

    Returns the filetype as string (something like ``'jpg'``) if the
    file format can be recognized, ``None`` else.

    Tested recognized filetypes currently are `jpg`, `png`, `fpm`, and
    `pdf`.

    More filetypes (though untested in waeup.kofa) are automatically
    recognized because we deploy the stdlib `imghdr` library. See this
    module's docs for a complete list of filetypes recognized.
    """
    if path is None and bytestream is None:
        return None

    if bytestream is None:
        detected = imghdr.what(path)
    else:
        detected = imghdr.what(path, bytestream)
    # We prefer the short names of some formats.
    for long_name, short_name in (('jpeg', 'jpg'), ('tiff', 'tif')):
        if detected == long_name:
            return short_name
    return detected
674
[10677]675
def check_pdf(bytestream, file):
    """Tell whether a file or bytestream contains PDF data.

    Works as a test/plugin for the stdlib `imghdr` library.

    If `file` is given, its first four bytes are examined (the file
    is rewound afterwards) and `bytestream` is ignored.

    Returns ``'pdf'`` if the data starts with ``%PDF`` and ``None``
    otherwise.
    """
    if file is None:
        head = bytestream
    else:
        file.seek(0)
        head = file.read(4)
        file.seek(0)
    return 'pdf' if head.startswith('%PDF') else None
689
# Hook `check_pdf` into the header check functions of stdlib `imghdr`,
# unless it is registered already.
if check_pdf not in imghdr.tests:
    imghdr.tests.append(check_pdf)
[8631]693
[10677]694
def _first_csv_row(path):
    """Return the first data row of CSV file `path` as dict.

    Returns an empty dict if the file provides no data row.
    """
    with open(path, 'rb') as fd:
        try:
            return next(csv.DictReader(fd))
        except StopIteration:
            return dict()


def merge_csv_files(path1, path2):
    """Merge two CSV files into one (appending).

    CSV data from `path2` will be merged into `path1` csv file. This
    is a bit like 'appending' data from path2 to data from path1.

    The path of the resulting temporary file will be returned.

    In the result file data from `path2` will always come _after_ data
    from `path1`.

    **Caution**: It is the _callers_ responsibility to remove the
    result file (which is created by tempfile.mkstemp) after usage.

    This CSV file merging copes with different column orders in both
    CSV files and even with different column sets in both files.

    Also broken/empty CSV files can be handled.

    All files opened here are closed before we return.
    """
    # sniff the col names
    fieldnames = sorted(
        set(_first_csv_row(path1)) | set(_first_csv_row(path2)))
    # now read/write the real data
    wp, tmp_path = tempfile.mkstemp()
    with os.fdopen(wp, 'wb') as out:
        writer = csv.DictWriter(out, fieldnames)
        writer.writerow(dict((x, x) for x in fieldnames))  # header
        for path in (path1, path2):
            with open(path, 'rb') as fd:
                for row in csv.DictReader(fd):
                    writer.writerow(row)
    return tmp_path
[9372]735
[10677]736
def product(sequence, start=1):
    """Multiply all numbers (_not_ strings) of `sequence`.

    The result is additionally multiplied by `start`, which defaults
    to 1. For an empty sequence 0 is returned.
    """
    if len(sequence) == 0:
        return 0
    total = start
    for factor in sequence:
        total *= factor
    return total
[9593]748
[10677]749
class NullHandler(logging.Handler):
    """A logging handler that drops every record.

    Useful if you want to shut up a log.

    Defined here for backwards compatibility with Python < 2.7.
    """
    def emit(self, record):
        """Ignore `record`."""
        return None
[10676]759
760
def check_csv_charset(iterable):
    """Check contents of `iterable` regarding valid CSV encoding and
    trailing whitespaces in data.

    `iterable` is expected to be an iterable on _rows_ (not
    chars). This is true for instance for
    filehandlers. `zope.publisher.browser.FileUpload` instances are
    _not_ iterable, unfortunately.

    Returns line num of first illegal char or ``None``. Line nums
    start counting with 1 (not zero). Returns -1 if data contain
    trailing whitespaces.

    Any other error raised while reading the data (for instance rows
    with missing values) is reported as number of the line following
    the last successfully read one.
    """
    linenum = 1
    try:
        for row in csv.DictReader(iterable):
            linenum += 1
            for value in row.values():
                if value.endswith(' '):
                    return -1
    except UnicodeDecodeError:
        return linenum
    except Exception:
        # Deliberately broad: any kind of broken data counts as an
        # error in the following line. We must not swallow
        # KeyboardInterrupt or SystemExit, though.
        return linenum + 1
    return None
[11824]787
788
class MemInfo(dict):
    """A dict with access to its items like if they are attributes.

    Asking for an attribute that is not stored as item raises an
    `AttributeError` (not a `KeyError`), so that `hasattr` and
    `getattr` with a default value behave as usual.
    """
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name)
795
796
def get_meminfo(src="/proc/meminfo"):
    """Get local memory info as provided in /proc/meminfo.

    Entries in /proc/meminfo are available as MemInfo attributes.

    By default we lookup a file /proc/meminfo. Another path can be
    passed in as `src` parameter. In this case `src` must be a regular
    file and contain meminfo-style data, i.e. lines like
    ``<KEY>: <NUMBER> kB``.

    If the given `src` (or `/proc/meminfo`) are not available, `None`
    is returned.
    """
    if not os.path.isfile(src):
        return None
    with open(src, 'r') as fd:
        lines = fd.read().splitlines()
    result = MemInfo()
    for line in lines:
        if ':' not in line:
            # Blank lines and the like carry no entry.
            continue
        key, value = line.split(':', 1)
        # Values look like '   16315896 kB', some come without unit.
        result[key] = int(value.split(' kB', 1)[0])
    return result
[12231]818
def html2dict(value=None, portal_language='en'):
    """Transforms a localized HTML text string into a dictionary.

    Different languages must be separated by ``>>xy<<`` whereas
    xy is the language code. Text parts without correct leading
    language separator - usually the first part has no language
    descriptor - are interpreted as texts in the portal's language.

    Returns a dict with language codes as keys and the respective
    texts, wrapped in a ``div`` element, as values. Values that
    cannot be split (like ``None``) give an empty dict.
    """
    try:
        parts = value.split('>>')
    except Exception:
        # Not a string (most probably None).
        return {}
    elements = {}
    lang = portal_language
    for part in parts:
        if part[2:4] == u'<<':
            lang = str(part[0:2].lower())
            text = part[4:]
        else:
            text = part
        elements[lang] = renderElement(u'div id="html"', contents=text)
    return elements
844
def rest2dict(value=None, portal_language='en'):
    """Transforms a localized REST text string into a dictionary.

    Different languages must be separated by ``>>xy<<`` whereas
    xy is the language code. Text parts without correct leading
    language separator - usually the first part has no language
    descriptor - are interpreted as texts in the portal's language.

    Returns a dict with language codes as keys and the respective
    texts, rendered as HTML and wrapped in a ``div`` element, as
    values. Values that cannot be split (like ``None``) give an empty
    dict.
    """
    try:
        parts = value.split('>>')
    except Exception:
        # Not a string (most probably None).
        return {}
    elements = {}
    lang = portal_language
    for part in parts:
        if part[2:4] == u'<<':
            lang = str(part[0:2].lower())
            text = part[4:]
        else:
            text = part
        elements[lang] = renderElement(
            u'div id="rest"', contents=ReST2HTML(text))
    return elements
870
871
872
class FormVarParser(HTMLParser):
    """An HTML parser that collects names and values of form elements.

       For each start tag of a parsed HTML document we check whether
       it provides both, a `name` and a `value` attribute. If so, the
       value is stored (as unicode) under the name in the `form_vars`
       dict of the parser instance.
    """

    def __init__(self):
        HTMLParser.__init__(self)  # old-style class - no super()
        self.form_vars = {}

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        if 'name' not in attributes or 'value' not in attributes:
            return
        self.form_vars[attributes['name']] = unicode(attributes['value'])
[15595]892
893
def extract_formvars(html_code):
    """Extract keys and values from an HTML form as dict.

       Each tag in `html_code` that provides a `name` and a `value`
       attribute contributes one entry to the result.

       No text, no values::

         >>> extract_formvars("")
         {}

       Simple input tags normally provide name and value::

         >>> extract_formvars("<input type='text' name='foo' value='bar'>")
         {'foo': u'bar'}

       The sample doc we stored in tests is a bit more difficult::

         >>> html_path = os.path.join(os.path.dirname(__file__),
         ...                          'tests', 'sample_response.html')
         >>> html_code = open(html_path, 'r').read()
         >>> import pprint
         >>> pprint.pprint(extract_formvars(html_code))
         {'AMOUNT': u'100',
         ...
          'TRANS_NUM': u'01ESA20190916134824YA3YJ8'}

    """
    parser = FormVarParser()
    parser.feed(html_code)
    return parser.form_vars
[15739]923
924
def get_catalog_docids(cat):
    """Get all docids for a given catalog `cat`.

    Catalogs store the ids of objects they index. Get all of these object ids.
    This function works at least for catalogs that provide field- and text
    indexes only.

    Returns a set of docids. The ids are collected in a set right from the
    start, so that ids stored in several indexes of `cat` are held in memory
    only once.
    """
    docids = set()
    for index in cat.values():
        try:
            # FieldIndexes
            keys = index._rev_index.keys()
        except AttributeError:
            # TextIndexes
            keys = index.index._docwords.keys()
        docids.update(keys)
    return docids
941
942
def reindex_cat(cat):
    """Reindex all objects stored in a catalog `cat`.

    Regular catalogs try to reindex all stored object ids of a ZODB when asked
    to reindex all contents. That can be overkill. This function reindexes only
    those objects, that were already stored in a catalog. It was tested for
    catalogs with at least 650000 objects.

    Changes are committed after each 5000 reindexed objects and once more
    when all objects were processed. Progress messages are printed to stdout.

    Please note, that reindexing catalogs can take a considerable amount of
    time. 100.000 objects took about 12 minutes to reindex on a 16 GB machine.
    """
    started = datetime.datetime.now()
    print("Collecting doc ids...")
    uidutil = getUtility(IIntIds, context=cat)
    uids = get_catalog_docids(cat)
    print("Found %s entries..." % len(uids))
    for count, docid in enumerate(uids, 1):
        ob = uidutil.getObject(docid)
        cat.index_doc(docid, ob)
        # indexes can become huge. commit changes every 5000th round to
        # keep the memory footprint of catalogs `updateIndex` manageable
        if count % 5000 == 0:
            transaction.commit()
    # Also store the objects reindexed since the last intermediate commit.
    transaction.commit()
    finished = datetime.datetime.now()
    print("Finished. %s" % (finished - started))
Note: See TracBrowser for help on using the repository browser.