source: main/waeup.kofa/trunk/src/waeup/kofa/utils/helpers.py @ 10530

Last change on this file since 10530 was 10028, checked in by uli, 12 years ago

Further updates of CSV-related places.

  • Property svn:keywords set to Id
File size: 22.4 KB
Line 
1## $Id: helpers.py 10028 2013-03-15 01:12:42Z uli $
2##
3## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""General helper functions for Kofa.
19"""
20import unicodecsv as csv # XXX: csv ops should move to dedicated module.
21import datetime
22import imghdr
23import logging
24import os
25import pytz
26import re
27import shutil
28import tempfile
29import grok
30from cStringIO import StringIO
31from docutils.core import publish_string
32from zope.component import getUtility
33from zope.component.interfaces import IFactory
34from zope.interface import implementedBy
35from zope.interface.interface import Method, Attribute
36from zope.schema import getFieldNames
37from zope.schema.fieldproperty import FieldProperty
38from zope.security.interfaces import NoInteraction
39from zope.security.management import getInteraction
40from zope.pluggableauth.interfaces import IAuthenticatorPlugin
41from waeup.kofa.interfaces import MessageFactory as _
42
# Chunk size in bytes (8 KiB) used when reading files blockwise.
BUFSIZE = 1024 * 8
44
def remove_file_or_directory(filepath):
    """Remove a file, a symbolic link or a whole directory tree.

    Different to :func:`shutil.rmtree` we also accept not existing
    paths (returning silently) and if a dir turns out to be a regular
    file, we remove that.

    Symbolic links are never followed: only the link itself is
    removed, also if it points to a directory or to a target that
    does not exist (anymore).

    Returns ``None``.
    """
    filepath = os.path.abspath(filepath)
    # `lexists` (unlike `exists`) is also true for dangling symlinks,
    # so these get removed instead of being silently skipped.
    if not os.path.lexists(filepath):
        return
    # `shutil.rmtree` refuses to work on symbolic links; a link to a
    # directory must therefore be unlinked like a regular file.
    if os.path.isdir(filepath) and not os.path.islink(filepath):
        shutil.rmtree(filepath)
    else:
        os.unlink(filepath)
    return
60
def copy_filesystem_tree(src, dst, overwrite=False, del_old=False):
    """Copy contents of directory `src` to directory `dst`.

    Both directories must exist, otherwise a :exc:`ValueError` is
    raised. The same happens if one of the paths is not a directory.

    If `overwrite` is true, any same named objects in `dst` will be
    removed and replaced. Otherwise these items will not be touched.

    If `del_old` is true, copied files and directories will be removed
    from the `src` directory.

    This function returns a list of names of non-copied items (those
    that already existed in `dst` and were not overwritten).

    Unix hidden files and directories (starting with '.') are not
    processed by this function.
    """
    if not os.path.exists(src):
        raise ValueError('source path does not exist: %s' % src)
    if not os.path.exists(dst):
        raise ValueError('destination path does not exist: %s' % dst)
    if not os.path.isdir(src):
        raise ValueError('source path is not a directory: %s' % src)
    if not os.path.isdir(dst):
        raise ValueError('destination path is not a directory: %s' % dst)
    not_copied = []
    for item in os.listdir(src):
        if item.startswith('.'):
            continue # We do not copy hidden stuff...
        itemsrc = os.path.join(src, item)
        itemdst = os.path.join(dst, item)

        if os.path.exists(itemdst):
            # Any true value enables overwriting (as documented), not
            # only the ``True`` singleton.
            if overwrite:
                remove_file_or_directory(itemdst)
            else:
                not_copied.append(item)
                continue

        if os.path.isdir(itemsrc):
            shutil.copytree(itemsrc, itemdst)
        else:
            shutil.copy2(itemsrc, itemdst)
        if del_old:
            remove_file_or_directory(itemsrc)
    return not_copied
106
107
def get_inner_HTML_part(html_code):
    """Return the 'inner' part of a complete HTML snippet.

    If there is a form part, get this.

    If there is no form part, try to return the body part contents.

    If there is no body, return as-is.

    Let's see how that works. If we deliver some doc with form, we
    will get that form only:

       >>> doc = '<html><form>My Form</form>Outside the form</html>'
       >>> get_inner_HTML_part(doc)
       '<form>My Form</form>'

    No form? Then seek for a body part and get the contents:

       >>> doc = '<html><body>My Body</body>Trailing Trash</html>'
       >>> get_inner_HTML_part(doc)
       'My Body'

    This also works if the body tag is the very first thing in the
    passed string:

       >>> get_inner_HTML_part('<body>My Body</body>')
       'My Body'

    If none of these is included, return what we got:

       >>> doc = '<html>without body nor form</html>'
       >>> get_inner_HTML_part(doc)
       '<html>without body nor form</html>'

    """
    # The leading greedy ``.*`` makes the match start at the *last*
    # opening tag found. ``.*`` (instead of ``.+``) also accepts
    # snippets that start or end directly with the wanted part.
    form_match = re.match(
        r'^.*(<form[^>]*>.*</form>).*$', html_code, re.DOTALL)
    if form_match is not None:
        return form_match.group(1)
    # No <form> part included
    body_match = re.match(
        r'^.*<body[^>]*>(.*)</body>.*$', html_code, re.DOTALL)
    if body_match is not None:
        return body_match.group(1)
    # No <form> and no <body> tag...
    return html_code
152
class FactoryBase(grok.GlobalUtility):
    """A factory for things.

    This is a baseclass for easier creation of factories. Factories
    are utilities that are registered under a certain name and return
    instances of certain classes when called.

    In :mod:`waeup.kofa` we use factories extensively for
    batching. While processing a batch some processors looks up a
    factory to create real-world instances that then get filled with
    data from imported CSV files.

    To get rid of reimplementing the same stuff over and over again,
    most notably the methods defined here, we offer this base class
    (which will *not* be registered as a factory itself).

    Real factories can then be created like this:

       >>> import grok
       >>> from waeup.kofa.utils.helpers import FactoryBase
       >>> class MyObject(object):
       ...   # Some class we want to get instances of.
       ...   pass
       >>> class MyObjectFactory(FactoryBase):
       ...   # This is the factory for MyObject instances
       ...   grok.name(u'waeup.kofa.factory.MyObject')
       ...   factory = MyObject

    That's it. It is essential to set the ``factory`` attribute, which
    will determine the class of which instances should be created when
    called. The given name must even be unique amongst all utilities
    registered during runtime. While you can pick any name you like
    you might want to prepend ``waeup.kofa.factory.`` to the name
    string to make sure it does not clash with names of other
    utilities one day.

    Before all this works we have to grok the baseclass once and our
    freshly defined factory. This executes all the component
    registration stuff we don't want to do ourselves. In daily use
    this is done automatically on startup of a :mod:`waeup.kofa`
    system.

       >>> grok.testing.grok('waeup.kofa.utils.helpers')
       >>> grok.testing.grok_component(
       ...    'MyObjectFactory', MyObjectFactory
       ...  )
       True

    After grokking we (and processors) can create objects without
    knowing about the location of the real class definition, just by
    the factory name:

       >>> from zope.component import createObject
       >>> obj = createObject('waeup.kofa.factory.MyObject')
       >>> isinstance(obj, MyObject)
       True

    We can also use the regular utility lookups to find our new
    factory:

       >>> from zope.component import getUtility
       >>> from zope.component.interfaces import IFactory
       >>> factory = getUtility(
       ...   IFactory, name='waeup.kofa.factory.MyObject'
       ...   )
       >>> isinstance(factory, MyObjectFactory)
       True

    And this factory generates `MyObject` instances:

       >>> obj = factory()
       >>> isinstance(obj, MyObject)
       True

    """
    grok.baseclass() # Do not grok this class, do not register us.
    grok.implements(IFactory)
    # You can override any of the following attributes in derived
    # classes. The `grok.name` setting *must* even be set to some
    # unique value.
    grok.name(u'waeup.Factory')
    # Plain unicode string; a trailing comma here would silently turn
    # the title into a one-element tuple.
    title = u"Create instances of ``factory``."
    description = u"This factory instantiates new applicant instances."
    # The class to instantiate; must be set by derived classes.
    factory = None

    def __call__(self, *args, **kw):
        """The main factory function.

        Returns an instance of the requested object.

        Positional and keyword arguments are accepted but not passed
        on: ``factory`` is always called without arguments.
        """
        return self.factory()

    def getInterfaces(self):
        """Return the interfaces implemented by ``factory``.

        Required by IFactory.
        """
        return implementedBy(self.factory)
248
def ReST2HTML_w_warnings(source_string):
    """Render reStructuredText to HTML and collect warnings separately.

    The result is a tuple ``(<HTML_CODE>, <WARNINGS>)``.
    ``<HTML_CODE>`` is the generated HTML (as unicode) and
    ``<WARNINGS>`` is either a string with all warning messages or
    ``None`` if the source could be rendered without complaints.

    A regular multi-line ReStructuredText string is turned into HTML
    code:

        >>> from waeup.kofa.utils.helpers import ReST2HTML_w_warnings
        >>> source = '''
        ... Headline
        ... ========
        ...
        ... - A list item
        ... - Another item
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <ul class="simple">
        <li>A list item</li>
        <li>Another item</li>
        </ul>
        <p>Thanks for watching!</p>
        </div>

    No warnings were issued here, so `warnings` is ``None``:

        >>> warnings is None
        True

    If there are warnings, we get them in the second item of the
    result. Let's render an erroneous document:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

        >>> print warnings
        <string>:3: (WARNING/2) Title underline too short.
        <BLANKLINE>
        Headline
        ======
        <BLANKLINE>

    The warnings do not show up inside the rendered document; they
    are only available from the returned warnings value, which is a
    string or ``None``.
    """
    warning_stream = StringIO()
    # First pass: report everything into our own stream.
    verbose_settings = {
        'report_level': 0,
        'warning_stream': warning_stream,
        }
    html_doc = publish_string(
        source_string, writer_name='html4css1',
        settings_overrides=verbose_settings)
    warning_stream.seek(0)
    collected = warning_stream.read()
    if collected:
        # Second pass: render again, this time with no warnings
        # inline...
        silent_settings = {
            'report_level': 10000,
            'halt_level': 10000,
            'warning_stream': warning_stream,
            }
        html_doc = publish_string(
            source_string, writer_name='html4css1',
            settings_overrides=silent_settings)
    else:
        # An empty message string means: no warnings at all.
        collected = None
    inner_html = get_inner_HTML_part(html_doc).strip()
    if isinstance(inner_html, unicode):
        return inner_html, collected
    return inner_html.decode('utf-8'), collected
339
def ReST2HTML(source_string):
    """Turn a ReStructuredText string into (unicode) HTML.

    Warnings (too short headings, etc.) are dropped silently. If you
    are interested in them, use :func:`ReST2HTML_w_warnings` instead.

    A well-formed document is rendered like this:

        >>> source = '''
        ... Headline
        ... ========
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    A document with markup problems (here: the underline is too short)
    gives a similar result:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    """
    # Only the HTML part of the (html, warnings) tuple is of interest.
    return ReST2HTML_w_warnings(source_string)[0]
385
def attrs_to_fields(cls, omit=[]):
    """Replace class attributes by FieldProperty instances.

    The schema fields are taken from the first interface implemented
    by `cls`. For each of its field names a
    :class:`FieldProperty` is set on the class.

    `omit` is a list of field names that should _not_ be turned into
    field properties. This is useful for properties and the like.

    Returns the (modified) class, so that with Python >= 2.6 this
    function can also be used as a class decorator.
    """
    schema = list(implementedBy(cls))[0]
    wanted = [name for name in getFieldNames(schema) if name not in omit]
    for name in wanted:
        setattr(cls, name, FieldProperty(schema[name]))
    return cls
400
def get_current_principal():
    """Return the principal of the current interaction.

    No request is needed for this lookup. Normally you should examine
    the request to find out about the principal involved; use this
    function only if you really have no access to the current request.

    Returns ``None`` if there is no interaction or if the interaction
    has no participations (for instance during tests).
    """
    try:
        participations = getInteraction().participations
        return participations[0].principal
    except (NoInteraction, IndexError):
        # No interaction at all or no participations present
        return None
421
def cmp_files(file_descr1, file_descr2):
    """Tell whether two open files have the same content.

    Both file objects are rewound to their start and then read
    blockwise (``BUFSIZE`` per read).

    Returns ``True`` if both are equal, ``False`` otherwise.
    """
    file_descr1.seek(0)
    file_descr2.seek(0)
    chunk1 = file_descr1.read(BUFSIZE)
    chunk2 = file_descr2.read(BUFSIZE)
    while chunk1 == chunk2:
        if not chunk1:
            # Both files are exhausted without any difference found.
            return True
        chunk1 = file_descr1.read(BUFSIZE)
        chunk2 = file_descr2.read(BUFSIZE)
    return False
436
def string_from_bytes(number):
    """Turn a number of bytes into some textual representation.

    Values below 1024 are given as bytes, values below 1024**2 as
    whole kilobytes (rounded down) and bigger values as megabytes or
    gigabytes with two decimal places.

    Returns a unicode string.

      Examples:

        >>> string_from_bytes(1)
        u'1 byte(s)'

        >>> string_from_bytes(1025)
        u'1 KB'

        >>> string_from_bytes(1.5 * 1024*1024)
        u'1.50 MB'

        >>> string_from_bytes(1572864)
        u'1.50 MB'

        >>> string_from_bytes(673.286 * 1024**3)
        u'673.29 GB'

    """
    if number < 1024:
        return u'%s byte(s)' % (str(number),)
    if number < 1024**2:
        # Floor division: whole KB for ints and floats alike.
        return u'%d KB' % (number // 1024,)
    # Force float division; with plain ``/`` integer input would be
    # truncated before formatting (giving '1.00 MB' for 1.5 MB).
    if number < 1024**3:
        return u'%.2f MB' % (float(number) / 1024**2,)
    return u'%.2f GB' % (float(number) / 1024**3,)
462
def file_size(file_like_obj):
    """Determine file size in most effective manner.

    Returns the number of bytes in a file. This function works for
    both, real files as well as file-like objects like cStringIO or
    :mod:`io` based 'files'.

    Example:

      >>> from cStringIO import StringIO
      >>> file_size(StringIO('my file content'))
      15

    Please note that this function expects the file-like object passed
    in to be at first reading position (it does no seek(0)) and that
    when finished the file pointer might be at end of file.
    """
    # Imported locally as `io` is needed only here.
    import io
    try:
        descriptor = file_like_obj.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # No real file: either there is no `fileno` at all (cStringIO)
        # or it exists but is not supported (io.BytesIO and friends).
        file_like_obj.seek(0, os.SEEK_END) # seek to last position in file
        return file_like_obj.tell()
    return os.fstat(descriptor).st_size
484
def get_user_account(request):
    """Return the local user account of the request's principal.

    The account is looked up by the principal id in the authenticator
    plugin registered under the name ``'users'``.
    """
    principal_id = request.principal.id
    users_plugin = getUtility(IAuthenticatorPlugin, name='users')
    return users_plugin.getAccount(principal_id)
492
def iface_names(iface, omit=[], exclude_attribs=True, exclude_methods=True):
    """Collect the attribute names of an interface and all its bases.

    By default names of pure attributes (i.e. descriptions whose
    class is exactly zope.interface.Attribute) and names of methods
    are left out. Set `exclude_attribs` or `exclude_methods` to
    ``False`` to get them as well.

    Names of typical fields derived from zope.schema are included.

    Names listed in `omit` are never part of the result.

    Returns an unsorted list of strings, each name appearing once.
    """
    known = set((iface,))
    # Add base interfaces until a pass does not find any new ones
    while True:
        grown = set(known)
        for candidate in grown:
            grown = set.union(grown, set(candidate.getBases()))
        if grown == known:
            # Nothing new found, the collection is complete
            break
        known = grown
    # Pick the names that pass all filters
    names = []
    for interface in known:
        for name, descr in interface.namesAndDescriptions():
            skip = (
                name in omit
                or (exclude_attribs and descr.__class__ is Attribute)
                or (exclude_methods and isinstance(descr, Method))
                or name in names)
            if not skip:
                names.append(name)
    return names
532
def get_sorted_preferred(tuples_iterable, preferred_list):
    """Get a list of tuples (<TITLE>,<TOKEN>) with values in
    `preferred_list` put in front.

    The rest of the tuples iterable is returned in orginal order. This
    is useful for putting default entries on top of (already sorted)
    lists of choice values, for instance when sorting countries and
    their code.

    Sample:

    We have a list of tuples with uppercase 'titles' and lowercase
    'tokens'. This list is already sorted but we want certain values
    of this list to show up before other values. For instance we want
    to see the 'C' entry to come first.

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['c'])
      (('C', 'c'), ('A', 'a'), ('B', 'b'))

    i.e. the entry with 'c' as second value moved to head of result.

    We can also require multiple entries at head of list:

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['b', 'c'])
      (('B', 'b'), ('C', 'c'), ('A', 'a'))

    We required the 'b' entry to come before the 'c' entry and then
    the rest of the input list. That's what we got.

    Preferred values that do not occur in the input are ignored:

      >>> get_sorted_preferred([('A','a'), ('B','b')], ['x', 'b'])
      (('B', 'b'), ('A', 'a'))

    The result is returned as a tuple of tuples to keep order of values.
    """
    preferred = list(preferred_list)
    # One slot per preferred value, to keep the requested order.
    head = [None] * len(preferred)
    tail = []
    for title, code in tuples_iterable:
        try:
            head[preferred.index(code)] = (title, code)
        except ValueError:
            # Not a preferred value: keep original order.
            tail.append((title, code))
    # Slots of preferred values not found in the input are still
    # `None`; do not leak these placeholders into the result.
    found = [entry for entry in head if entry is not None]
    return tuple(found + tail)
574
def now(tz=None):
    """Return the current datetime, shifted into timezone `tz`.

    `tz` should be a timezone (`tzinfo` instance) as defined in
    pytz. If it is ``None``, UTC time is returned.
    """
    utc_now = datetime.datetime.utcnow()
    return to_timezone(utc_now, tz=tz)
583
def to_timezone(dt, tz=None):
    """Convert datetime `dt` into timezone `tz`.

    A 'naive' `dt` (one without `tzinfo`) is taken as UTC time.

    Without a `tz` the value is shifted to UTC.

    Values that are not datetime.datetime instances are returned
    unchanged.
    """
    if not isinstance(dt, datetime.datetime):
        return dt
    target_tz = pytz.utc if tz is None else tz
    aware_dt = pytz.utc.localize(dt) if dt.tzinfo is None else dt
    # Normalize in the source timezone first, then again after the
    # shift into the target timezone.
    source_normalized = aware_dt.tzinfo.normalize(aware_dt)
    return target_tz.normalize(source_normalized.astimezone(target_tz))
602
def get_fileformat(path, bytestream=None):
    """Try to find out the file format of a given media file.

    The detection is delegated to the stdlib `imghdr` library, which
    examines header data (Magic Words). Filename extensions or the
    like are not considered.

    If `bytestream` is not ``None`` it is passed to `imghdr` along
    with `path`; `imghdr` then examines the bytestream only.

    Returns the filetype as string (something like ``'jpg'``) if the
    format is recognized, ``None`` otherwise. The `imghdr` results
    ``'jpeg'`` and ``'tiff'`` are returned as ``'jpg'`` and
    ``'tif'`` respectively.

    Tested recognized filetypes currently are `jpg`, `png`, and `pdf`.

    More filetypes (though untested in waeup.kofa) are recognized
    automatically by `imghdr`. See the docs of this module for a
    complete list.
    """
    if path is None and bytestream is None:
        return None

    if bytestream is None:
        detected = imghdr.what(path)
    else:
        detected = imghdr.what(path, bytestream)
    # Map imghdr names to the short forms used here
    if detected == 'jpeg':
        return 'jpg'
    if detected == 'tiff':
        return 'tif'
    return detected
634
def check_pdf(bytestream, file):
    """Check whether a bytestream or file starts like a PDF file.

    If `file` is not ``None``, the first four bytes are read from
    its start (the file is rewound before and afterwards) and
    `bytestream` is ignored.

    Returns ``'pdf'`` if the examined data starts with ``%PDF``,
    ``None`` otherwise.

    Works as a test/plugin for the stdlib `imghdr` library.
    """
    header = bytestream
    if file is not None:
        file.seek(0)
        header = file.read(4)
        file.seek(0)
    return 'pdf' if header.startswith('%PDF') else None
648
# Make `imghdr` aware of PDF files: add `check_pdf` to its header
# check functions (only once, also if this module is reloaded).
if not any(test is check_pdf for test in imghdr.tests):
    imghdr.tests.append(check_pdf)
652
def merge_csv_files(path1, path2):
    """Merge two CSV files into one (appending).

    CSV data from `path2` will be merged into `path1` csv file. This
    is a bit like 'appending' data from path2 to data from path1.

    The path of the resulting temporary file will be returned.

    In the result file data from `path2` will always come _after_ data
    from `path1`.

    **Caution**: It is the _callers_ responsibility to remove the
    result file (which is created by tempfile.mkstemp) after usage.

    This CSV file merging copes with different column orders in both
    CSV files and even with different column sets in both files. The
    columns of the result file are the (sorted) union of the columns
    found in the first data row of each input file.

    Also broken/empty CSV files can be handled.

    All files opened here are closed again before returning.
    """
    # sniff the col names from the first data row of each file
    colnames = set()
    for path in (path1, path2):
        with open(path, 'rb') as source:
            try:
                first_row = next(csv.DictReader(source))
            except StopIteration:
                # empty file or header only: no columns to add
                continue
        colnames.update(first_row.keys())
    fieldnames = sorted(colnames)
    # now read/write the real data
    wp, tmp_path = tempfile.mkstemp()
    with os.fdopen(wp, 'wb') as target:
        writer = csv.DictWriter(target, fieldnames)
        writer.writerow(dict((x, x) for x in fieldnames)) # header
        # rows of path1 first, then rows of path2
        for path in (path1, path2):
            with open(path, 'rb') as source:
                for row in csv.DictReader(source):
                    writer.writerow(row)
    return tmp_path
693
def product(sequence, start=1):
    """Multiply all numbers (_not_ strings) of a sequence.

    The product is additionally multiplied by `start` (defaults to
    1).

    For an empty sequence 0 is returned, regardless of `start`.
    """
    if len(sequence) == 0:
        return 0
    total = start
    for factor in sequence:
        total *= factor
    return total
705
class NullHandler(logging.Handler):
    """A logging handler that swallows all records.

    Nothing is logged by this handler. Add it to a logger you want to
    keep quiet.

    Provided here because the stdlib brings its own NullHandler only
    from Python 2.7 on.
    """
    def emit(self, record):
        """Discard `record`."""
        return None
Note: See TracBrowser for help on using the repository browser.