source: main/waeup.kofa/trunk/src/waeup/kofa/utils/helpers.py @ 8643

Last change on this file since 8643 was 8633, checked in by uli, 13 years ago

Provide the more robust CSV file merging.

  • Property svn:keywords set to Id
File size: 21.5 KB
Line 
1## $Id: helpers.py 8633 2012-06-06 01:11:19Z uli $
2##
3## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""General helper functions for Kofa.
19"""
20import csv
21import datetime
22import imghdr
23import os
24import pytz
25import re
26import shutil
27import tempfile
28import grok
29from cStringIO import StringIO
30from docutils.core import publish_string
31from zope.component import getUtility
32from zope.component.interfaces import IFactory
33from zope.interface import implementedBy
34from zope.interface.interface import Method, Attribute
35from zope.schema import getFieldNames
36from zope.schema.fieldproperty import FieldProperty
37from zope.security.interfaces import NoInteraction
38from zope.security.management import getInteraction
39from zope.pluggableauth.interfaces import IAuthenticatorPlugin
40
# Chunk size (in bytes) used when reading files block-wise, for instance
# by :func:`cmp_files`.
BUFSIZE = 8 * 1024
42
def remove_file_or_directory(filepath):
    """Remove a file or directory.

    Different to :func:`shutil.rmtree` we also accept not existing
    paths (returning silently) and if a dir turns out to be a regular
    file, we remove that.

    Symbolic links are never followed: the link itself is removed,
    also if it is dangling or points to a directory. The link target
    stays untouched.
    """
    filepath = os.path.abspath(filepath)
    if os.path.islink(filepath):
        # Links need special care: `os.path.exists` reports dangling
        # links as missing and `shutil.rmtree` refuses to operate on
        # links to directories.
        os.unlink(filepath)
        return
    if not os.path.exists(filepath):
        return
    if os.path.isdir(filepath):
        shutil.rmtree(filepath)
    else:
        os.unlink(filepath)
    return
58
def copy_filesystem_tree(src, dst, overwrite=False, del_old=False):
    """Copy contents of directory src to directory dst.

    Both directories must exist, otherwise a :exc:`ValueError` is
    raised.

    If `overwrite` is true, any same named objects in `dst` will be
    removed and replaced. Otherwise these objects will not be touched.

    If `del_old` is true, copied files and directories will be removed
    from the src directory.

    This function returns a list of names of non-copied items.

    Unix hidden files and directories (starting with '.') are not
    processed by this function.
    """
    endpoints = (('source', src), ('destination', dst))
    # Existence of both paths is checked before their type, so that a
    # missing path is always reported as missing.
    for label, path in endpoints:
        if not os.path.exists(path):
            raise ValueError('%s path does not exist: %s' % (label, path))
    for label, path in endpoints:
        if not os.path.isdir(path):
            raise ValueError(
                '%s path is not a directory: %s' % (label, path))
    not_copied = []
    for item in os.listdir(src):
        if item.startswith('.'):
            continue  # We do not copy hidden stuff...
        itemsrc = os.path.join(src, item)
        itemdst = os.path.join(dst, item)

        if os.path.exists(itemdst):
            # Any true value enables overwriting, just as any true
            # value for `del_old` enables removal below.
            if not overwrite:
                not_copied.append(item)
                continue
            remove_file_or_directory(itemdst)

        if os.path.isdir(itemsrc):
            shutil.copytree(itemsrc, itemdst)
        else:
            shutil.copy2(itemsrc, itemdst)
        if del_old:
            remove_file_or_directory(itemsrc)
    return not_copied
104
105
def get_inner_HTML_part(html_code):
    """Return the 'inner' part of a complete HTML snippet.

    If there is a form part, get this.

    If there is no form part, try to return the body part contents.

    If there is no body, return as-is.

    Let's see how that works. If we deliver some doc with form, we
    will get that form only:

       >>> doc = '<html><form>My Form</form>Outside the form</html>'
       >>> get_inner_HTML_part(doc)
       '<form>My Form</form>'

    No form? Then seek for a body part and get the contents:

       >>> doc = '<html><body>My Body</body>Trailing Trash</html>'
       >>> get_inner_HTML_part(doc)
       'My Body'

    If none of these is included, return what we got:

       >>> doc = '<html>without body nor form</html>'
       >>> get_inner_HTML_part(doc)
       '<html>without body nor form</html>'

    Please note that a form is only detected if it is surrounded by
    at least one character on each side, and a body only if at least
    one character precedes it.
    """
    # Raw strings keep the regular expression escapes away from
    # Python's own string escape handling.
    form_match = re.match(
        r'^.+(<form[^\>]*>.*</form>).+$', html_code, re.DOTALL)
    if form_match is not None:
        return form_match.group(1)
    # No <form> part included
    body_match = re.match(
        r'^.+<body[^\>]*>(.*)</body>.*$', html_code, re.DOTALL)
    if body_match is not None:
        return body_match.group(1)
    # No <form> and no <body> tag...
    return html_code
150
class FactoryBase(grok.GlobalUtility):
    """A factory for things.

    This is a baseclass for easier creation of factories. Factories
    are utilities that are registered under a certain name and return
    instances of certain classes when called.

    In :mod:`waeup.kofa` we use factories extensively for
    batching. While processing a batch some processors look up a
    factory to create real-world instances that then get filled with
    data from imported CSV files.

    To get rid of reimplementing the same stuff over and over again,
    most notably the methods defined here, we offer this base class
    (which will *not* be registered as a factory itself).

    Real factories can then be created like this:

       >>> import grok
       >>> from waeup.kofa.utils.helpers import FactoryBase
       >>> class MyObject(object):
       ...   # Some class we want to get instances of.
       ...   pass
       >>> class MyObjectFactory(FactoryBase):
       ...   # This is the factory for MyObject instances
       ...   grok.name(u'waeup.kofa.factory.MyObject')
       ...   factory = MyObject

    That's it. It is essential to set the ``factory`` attribute, which
    will determine the class of which instances should be created when
    called. The given name must even be unique amongst all utilities
    registered during runtime. While you can pick any name you like
    you might want to prepend ``waeup.kofa.factory.`` to the name
    string to make sure it does not clash with names of other
    utilities one day.

    Before all this works we have to grok the baseclass once and our
    freshly defined factory. This executes all the component
    registration stuff we don't want to do ourselves. In daily use
    this is done automatically on startup of a :mod:`waeup.kofa`
    system.

       >>> grok.testing.grok('waeup.kofa.utils.helpers')
       >>> grok.testing.grok_component(
       ...    'MyObjectFactory', MyObjectFactory
       ...  )
       True

    After grokking we (and processors) can create objects without
    knowing about the location of the real class definition, just by
    the factory name:

       >>> from zope.component import createObject
       >>> obj = createObject('waeup.kofa.factory.MyObject')
       >>> isinstance(obj, MyObject)
       True

    We can also use the regular utility lookups to find our new
    factory:

       >>> from zope.component import getUtility
       >>> from zope.component.interfaces import IFactory
       >>> factory = getUtility(
       ...   IFactory, name='waeup.kofa.factory.MyObject'
       ...   )
       >>> isinstance(factory, MyObjectFactory)
       True

    And this factory generates `MyObject` instances:

       >>> obj = factory()
       >>> isinstance(obj, MyObject)
       True

    """
    grok.baseclass() # Do not grok this class, do not register us.
    grok.implements(IFactory)
    # You can override any of the following attributes in derived
    # classes. The `grok.name` setting *must* even be set to some
    # unique value.
    grok.name(u'waeup.Factory')
    # A plain string; a trailing comma here would turn it into a tuple.
    title = u"Create instances of ``factory``."
    description = u"This factory instantiates new applicant instances."
    # The class to instantiate; must be set by derived classes.
    factory = None

    def __call__(self, *args, **kw):
        """The main factory function.

        Returns an instance of the requested object.

        Positional and keyword arguments are accepted but not passed
        on: ``factory`` is always called without arguments.
        """
        return self.factory()

    def getInterfaces(self):
        """Return the interfaces implemented by ``factory``.

        Required by IFactory.
        """
        return implementedBy(self.factory)
246
def ReST2HTML_w_warnings(source_string):
    """Convert a reStructuredText string to HTML preserving warnings.

    Returns a tuple ``(<HTML_CODE>, <WARNINGS>)``. ``<HTML_CODE>`` is
    the HTML code generated from the source string (in unicode),
    ``<WARNINGS>`` is a string containing any warning messages or
    ``None`` if there were none.

    Regular multi-line ReStructuredText strings will be returned as
    HTML code:

        >>> from waeup.kofa.utils.helpers import ReST2HTML_w_warnings
        >>> source = '''
        ... Headline
        ... ========
        ...
        ... - A list item
        ... - Another item
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <ul class="simple">
        <li>A list item</li>
        <li>Another item</li>
        </ul>
        <p>Thanks for watching!</p>
        </div>

    Here no warnings happened, so the `warnings` are ``None``:

        >>> warnings is None
        True

    If warnings happen then they can be retrieved in the returned
    ``warnings``. We try to render an erroneous document:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

        >>> print warnings
        <string>:3: (WARNING/2) Title underline too short.
        <BLANKLINE>
        Headline
        ======
        <BLANKLINE>

    As you can see, the warnings are not displayed inline the document
    but can be retrieved from the returned warnings, which is a string
    or ``None``.
    """
    warning_stream = StringIO()
    # First pass: report everything into the stream to collect warnings.
    verbose_settings = {
        'report_level': 0,
        'warning_stream': warning_stream,
        }
    fulldoc = publish_string(
        source_string, writer_name='html4css1',
        settings_overrides=verbose_settings)
    warning_stream.seek(0)
    # An empty message string means: no warnings at all.
    warning_msgs = warning_stream.read() or None
    if warning_msgs is not None:
        # Render again, this time with no warnings inline...
        quiet_settings = {
            'report_level': 10000,
            'halt_level': 10000,
            'warning_stream': warning_stream,
            }
        fulldoc = publish_string(
            source_string, writer_name='html4css1',
            settings_overrides=quiet_settings)
    html = get_inner_HTML_part(fulldoc).strip()
    if isinstance(html, unicode):
        return html, warning_msgs
    return html.decode('utf-8'), warning_msgs
337
def ReST2HTML(source_string):
    """Render a string containing ReStructuredText to HTML.

    This is :func:`ReST2HTML_w_warnings` with the warnings part of the
    result thrown away. Use that function if you are interested in
    warnings about too short headings, etc.

    The returned string will be unicode.

    A regular document will be rendered like this:

        >>> source = '''
        ... Headline
        ... ========
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    A document with markup problems (here: the underline is too short)
    will look similar:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    """
    html, _ignored_warnings = ReST2HTML_w_warnings(source_string)
    return html
383
def attrs_to_fields(cls):
    """Install FieldProperty instances for the schema fields of a class.

    The schema is the first interface reported by ``implementedBy``
    for `cls`. For each field name of that schema a `FieldProperty`
    is set as class attribute of `cls`.

    Returns `cls`, so with Python >= 2.6 this function can also be
    used as a class decorator.
    """
    implemented = list(implementedBy(cls))
    schema = implemented[0]
    for name in getFieldNames(schema):
        field = schema[name]
        setattr(cls, name, FieldProperty(field))
    return cls
393
def get_current_principal():
    """Get the 'current' principal.

    The principal is taken from the first participation of the
    current interaction, so no request is needed. Examining a request
    is nevertheless the regular (and recommended) way to get the
    principal involved 'currently'; use this function only if you
    really have no access to the current request.

    Returns ``None`` when no principal is involved (for instance
    during tests), i.e. if there is no interaction or the interaction
    has no participations.
    """
    try:
        participations = getInteraction().participations
        return participations[0].principal
    except (NoInteraction, IndexError):
        # No interaction at all or no participations present.
        return None
414
def cmp_files(file_descr1, file_descr2):
    """Compare the contents of two open files.

    Both files are rewound and then read chunk-wise (``BUFSIZE``
    bytes at a time).

    Returns ``True`` if both are equal, ``False`` otherwise.
    """
    for stream in (file_descr1, file_descr2):
        stream.seek(0)
    chunk1 = file_descr1.read(BUFSIZE)
    chunk2 = file_descr2.read(BUFSIZE)
    while chunk1 == chunk2:
        if not chunk1:
            # Both files ended at the same time without a difference.
            return True
        chunk1 = file_descr1.read(BUFSIZE)
        chunk2 = file_descr2.read(BUFSIZE)
    return False
429
def string_from_bytes(number):
    """Turn a number into some textual representation.

    `number` is a number of bytes (integer or float). Values below
    1024 are reported in bytes, values below 1024**2 in whole KB
    (rounded down), larger values in MB or GB with two decimal
    places.

      Examples:

        >>> string_from_bytes(1)
        u'1 byte(s)'

        >>> string_from_bytes(1025)
        u'1 KB'

        >>> string_from_bytes(1536 * 1024)
        u'1.50 MB'

        >>> string_from_bytes(1.5 * 1024*1024)
        u'1.50 MB'

        >>> string_from_bytes(673.286 * 1024**3)
        u'673.29 GB'

    """
    if number < 1024:
        return u'%s byte(s)' % (str(number),)
    elif number < 1024**2:
        # Explicit floor division: whole KB for ints and floats alike.
        return u'%s KB' % (int(number // 1024),)
    elif number < 1024**3:
        # Explicit float division: integer input must not lose the
        # fractional part.
        return u'%.2f MB' % (number / float(1024**2),)
    return u'%.2f GB' % (number / float(1024**3),)
455
def file_size(file_like_obj):
    """Determine file size in most effective manner.

    Returns the number of bytes in a file. This function works for
    both, real files as well as file-like objects like cStringIO based
    'files'.

    Example:

      >>> from cStringIO import StringIO
      >>> file_size(StringIO('my file content'))
      15

    For objects providing a usable ``fileno()`` the size is taken from
    ``os.fstat``. All other objects (also those whose ``fileno()``
    exists but is not supported, like ``io.BytesIO``) are measured by
    seeking to their end.

    Please note that this function expects the file-like object passed
    in to be at first reading position (it does no seek(0)) and that
    when finished the file pointer might be at end of file.
    """
    fileno = getattr(file_like_obj, 'fileno', None)
    if fileno is not None:
        try:
            return os.fstat(fileno()).st_size
        except (IOError, OSError, ValueError):
            # In-memory files may offer `fileno` without supporting
            # it. Fall back to seeking.
            pass
    file_like_obj.seek(0, 2) # seek to last position in file
    return file_like_obj.tell()
477
def get_user_account(request):
    """Return local user account.

    The id of the principal of `request` is looked up in the
    authenticator plugin registered under the name ``users``. The
    result of its ``getAccount`` call is returned as-is.
    """
    user_id = request.principal.id
    users = getUtility(IAuthenticatorPlugin, name='users')
    return users.getAccount(user_id)
485
def iface_names(iface, omit=(), exclude_attribs=True, exclude_methods=True):
    """Get all attribute names of an interface.

    Searches also base interfaces.

    Names of fields that are pure attributes
    (i.e. zope.interface.Attribute) or methods are excluded by
    default.

    Names of typical fields derived from zope.schema are included.

    The `omit` parameter can give a list (or any other container) of
    names to exclude.

    Returns an unsorted list of strings.
    """
    # Collect all interfaces (also bases) with a worklist
    collected = set()
    pending = [iface]
    while pending:
        current = pending.pop()
        if current in collected:
            continue
        collected.add(current)
        pending.extend(current.getBases())
    # Collect (filtered) names of collected interfaces
    result = []
    for current in collected:
        for name, descr in current.namesAndDescriptions():
            if name in omit:
                continue
            # Only *pure* attributes are skipped here, hence the exact
            # class check: subclasses of Attribute are kept.
            if exclude_attribs and descr.__class__ is Attribute:
                continue
            if exclude_methods and isinstance(descr, Method):
                continue
            result.append(name)
    return result
523
def get_sorted_preferred(tuples_iterable, preferred_list):
    """Get a list of tuples (<TITLE>,<TOKEN>) with values in
    `preferred_list` put in front.

    The rest of the tuples iterable is returned in original order.
    This is useful for putting default entries on top of (already
    sorted) lists of choice values, for instance when sorting
    countries and their code.

    Sample:

    We have a list of tuples with uppercase 'titles' and lowercase
    'tokens'. This list is already sorted but we want certain values
    of this list to show up before other values. For instance we want
    to see the 'C' entry to come first.

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['c'])
      (('C', 'c'), ('A', 'a'), ('B', 'b'))

    i.e. the entry with 'c' as second value moved to head of result.

    We can also require multiple entries at head of list:

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['b', 'c'])
      (('B', 'b'), ('C', 'c'), ('A', 'a'))

    We required the 'b' entry to come before the 'c' entry and then
    the rest of the input list. That's what we got.

    Preferred values that do not appear in the tuples iterable are
    simply skipped:

      >>> get_sorted_preferred([('A','a'), ('B','b')], ['x', 'b'])
      (('B', 'b'), ('A', 'a'))

    The result is returned as a tuple of tuples to keep order of values.
    """
    preferred = list(preferred_list)
    # One slot per preferred value keeps the requested order.
    head = [None] * len(preferred)
    tail = []
    for title, code in tuples_iterable:
        if code in preferred:
            head[preferred.index(code)] = (title, code)
        else:
            tail.append((title, code))
    # Slots of preferred values never seen are dropped, so callers
    # never have to cope with `None` entries.
    found = [entry for entry in head if entry is not None]
    return tuple(found + tail)
565
def now(tz=None):
    """Get current datetime in timezone of `tz`.

    The current UTC time is taken and handed over to
    :func:`to_timezone` together with `tz`.

    If `tz`, a `tzinfo` instance, is None, UTC time is returned.

    `tz` should be a timezone as defined in pytz.
    """
    utc_now = datetime.datetime.utcnow()
    return to_timezone(utc_now, tz=tz)
574
def to_timezone(dt, tz=None):
    """Shift datetime into timezone `tz`.

    Values of `dt` that are not a datetime.datetime are returned
    unchanged.

    A 'naive' `dt` (one without `tzinfo`) is assumed to be UTC.

    If no `tz` is given, shift to UTC is performed.
    """
    if not isinstance(dt, datetime.datetime):
        return dt
    target = pytz.utc if tz is None else tz
    aware = pytz.utc.localize(dt) if dt.tzinfo is None else dt
    # Normalize in the source timezone first, then in the target one.
    normalized = aware.tzinfo.normalize(aware)
    return target.normalize(normalized.astimezone(target))
593
def get_fileformat(path, bytestream=None):
    """Try to determine the file format of a given media file.

    The checks done here are not very thorough, but they make no
    assumptions about the filetype by looking at the filename
    extension or similar. Instead header data is checked to comply
    with common known rules (Magic Words).

    If bytestream is not `None` the `path` is ignored.

    Returns filetype as string (something like ``'jpg'``) if
    file-format can be recognized, ``None`` else. ``None`` is also
    returned if neither `path` nor `bytestream` is given.

    Tested recognized filetypes currently are `jpg`, `png`, and `pdf`.

    More filetypes (though untested in waeup.kofa) are automatically
    recognized because we deploy the stdlib `imghdr` library. See this
    module's docs for a complete list of filetypes recognized.
    """
    if path is None and bytestream is None:
        return None

    if bytestream is None:
        detected = imghdr.what(path)
    else:
        detected = imghdr.what(path, bytestream)
    # Some names reported by `imghdr` are replaced by shorter ones.
    short_names = {'jpeg': 'jpg', 'tiff': 'tif'}
    return short_names.get(detected, detected)
625
def check_pdf(bytestream, file):
    """Tell whether a file or bytestream is a PDF file.

    Works as a test/plugin for the stdlib `imghdr` library.

    If `file` is not ``None``, its first four bytes are examined
    instead of `bytestream` and the file is rewound afterwards.

    Returns ``'pdf'`` if the data starts with the PDF magic
    (``%PDF``), ``None`` otherwise.
    """
    if file is not None:
        file.seek(0)
        bytestream = file.read(4)
        file.seek(0)

    # Header data is byte data, so the magic is a bytes literal as well
    # (which is a regular string with Python 2).
    if bytestream.startswith(b'%PDF'):
        return 'pdf'
    return None
639
# Register check_pdf as header check function with `imghdr`. The
# membership test keeps us from registering the check more than once.
if check_pdf not in imghdr.tests:
    imghdr.tests.append(check_pdf)
643
def _csv_open(target, mode, opener=open):
    """Open `target` in a way suitable for the `csv` module.

    `mode` is ``'r'`` or ``'w'``; `opener` is ``open`` (for paths) or
    ``os.fdopen`` (for file descriptors).
    """
    if str is bytes:
        # Python 2: the csv module works on byte streams.
        return opener(target, mode + 'b')
    # Python 3: the csv module wants text with untranslated newlines.
    return opener(target, mode, newline='')


def _csv_fieldnames(path):
    """Return the header column names of CSV file `path` as a list.

    An empty file gives an empty list.
    """
    with _csv_open(path, 'r') as fd:
        return list(csv.DictReader(fd).fieldnames or [])


def merge_csv_files(path1, path2):
    """Merge two CSV files into one (appending).

    CSV data from `path2` will be merged into `path1` csv file. This
    is a bit like 'appending' data from path2 to data from path1.

    The path of the resulting temporary file will be returned.

    In the result file data from `path2` will always come _after_ data
    from `path1`.

    **Caution**: It is the _callers_ responsibility to remove the
    result file (which is created by tempfile.mkstemp) after usage. If
    merging fails, the temporary file is removed before the error is
    passed on.

    This CSV file merging copes with different column orders in both
    CSV files and even with different column sets in both files. The
    columns of the result are the sorted union of the header columns
    of both files, also if a file contains a header line only.

    Also broken/empty CSV files can be handled: cells missing in a row
    are left empty, surplus cells (for which no header column exists)
    are dropped.
    """
    # sniff the col names
    fieldnames = sorted(
        set(_csv_fieldnames(path1)) | set(_csv_fieldnames(path2)))
    # now read/write the real data
    out_fd, tmp_path = tempfile.mkstemp()
    try:
        with _csv_open(out_fd, 'w', opener=os.fdopen) as out:
            # Surplus cells are collected by DictReader under a key
            # that is no column name; 'ignore' drops them.
            writer = csv.DictWriter(out, fieldnames, extrasaction='ignore')
            writer.writerow(dict((x, x) for x in fieldnames)) # header
            for path in (path1, path2):
                with _csv_open(path, 'r') as src:
                    for row in csv.DictReader(src):
                        writer.writerow(row)
    except Exception:
        # Do not leave half-written result files behind.
        os.unlink(tmp_path)
        raise
    return tmp_path
Note: See TracBrowser for help on using the repository browser.