source: main/waeup.kofa/backup-trunk/src/waeup/kofa/utils/helpers.py @ 9591

Last change on this file since 9591 was 9043, checked in by Henrik Bettermann, 12 years ago

Do not export repeated fields twice.

  • Property svn:keywords set to Id
File size: 21.6 KB
Line 
1## $Id: helpers.py 9043 2012-07-23 21:08:27Z henrik $
2##
3## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""General helper functions for Kofa.
19"""
20import csv
21import datetime
22import imghdr
23import os
24import pytz
25import re
26import shutil
27import tempfile
28import grok
29from cStringIO import StringIO
30from docutils.core import publish_string
31from zope.component import getUtility
32from zope.component.interfaces import IFactory
33from zope.interface import implementedBy
34from zope.interface.interface import Method, Attribute
35from zope.schema import getFieldNames
36from zope.schema.fieldproperty import FieldProperty
37from zope.security.interfaces import NoInteraction
38from zope.security.management import getInteraction
39from zope.pluggableauth.interfaces import IAuthenticatorPlugin
40from waeup.kofa.interfaces import MessageFactory as _
41
# Chunk size in bytes (8 KiB) used by :func:`cmp_files` for each read.
BUFSIZE = 8 * 1024
43
def remove_file_or_directory(filepath):
    """Remove a file or directory.

    Different to :func:`shutil.rmtree` we also accept not existing
    paths (returning silently) and if a dir turns out to be a regular
    file, we remove that.

    Symbolic links are never followed: the link itself is removed
    (also if it is dangling or points to a directory), while the link
    target is left untouched.
    """
    filepath = os.path.abspath(filepath)
    if not os.path.lexists(filepath):
        # Really nothing there, not even a dangling symlink.
        return
    if os.path.isdir(filepath) and not os.path.islink(filepath):
        shutil.rmtree(filepath)
    else:
        # Regular files and any kind of symlink. `shutil.rmtree`
        # refuses to operate on symlinks, so we unlink them instead.
        os.unlink(filepath)
    return
59
def copy_filesystem_tree(src, dst, overwrite=False, del_old=False):
    """Copy contents of directory src to directory dst.

    Both directories must exist, otherwise a :exc:`ValueError` is
    raised.

    If `overwrite` is true (any true value), same named objects in
    `dst` are removed before copying. Otherwise these items will not
    be touched and their names are reported in the result.

    If `del_old` is true, copied files and directories will be removed
    from the src directory. Items that were not copied are kept.

    This function returns a list of names of non-copied items.

    Unix hidden files and directories (starting with '.') are not
    processed by this function.
    """
    if not os.path.exists(src):
        raise ValueError('source path does not exist: %s' % src)
    if not os.path.exists(dst):
        raise ValueError('destination path does not exist: %s' % dst)
    if not os.path.isdir(src):
        raise ValueError('source path is not a directory: %s' % src)
    if not os.path.isdir(dst):
        raise ValueError('destination path is not a directory: %s' % dst)
    not_copied = []
    for item in os.listdir(src):
        if item.startswith('.'):
            continue  # We do not copy hidden stuff...
        itemsrc = os.path.join(src, item)
        itemdst = os.path.join(dst, item)

        if os.path.exists(itemdst):
            if not overwrite:
                # Keep the existing destination item and report it.
                not_copied.append(item)
                continue
            # Truthiness check (not ``is True``), as documented.
            remove_file_or_directory(itemdst)

        if os.path.isdir(itemsrc):
            shutil.copytree(itemsrc, itemdst)
        else:
            shutil.copy2(itemsrc, itemdst)
        if del_old:
            remove_file_or_directory(itemsrc)
    return not_copied
105
106
def get_inner_HTML_part(html_code):
    """Return the 'inner' part of a complete HTML snippet.

    If there is a form part, get this.

    If there is no form part, try to return the body part contents.

    If there is no body, return as-is.

    Let's see how that works. If we deliver some doc with form, we
    will get that form only:

       >>> doc = '<html><form>My Form</form>Outside the form</html>'
       >>> get_inner_HTML_part(doc)
       '<form>My Form</form>'

    No form? Then seek for a body part and get the contents:

       >>> doc = '<html><body>My Body</body>Trailing Trash</html>'
       >>> get_inner_HTML_part(doc)
       'My Body'

    If none of these is included, return what we got:

       >>> doc = '<html>without body nor form</html>'
       >>> get_inner_HTML_part(doc)
       '<html>without body nor form</html>'

    Please note that a form is only detected if at least one
    character precedes and follows it, and a body only if at least one
    character precedes it. Snippets that start right away with the
    respective tag are returned as-is.
    """
    # Raw string literals: the patterns contain a backslash that is
    # meant for the regular expression engine, not for Python.
    form_match = re.match(
        r'^.+(<form[^\>]*>.*</form>).+$', html_code, re.DOTALL)
    if form_match is not None:
        return form_match.group(1)
    # No <form> part included
    body_match = re.match(
        r'^.+<body[^\>]*>(.*)</body>.*$', html_code, re.DOTALL)
    if body_match is not None:
        return body_match.group(1)
    # No <form> and no <body> tag...
    return html_code
151
class FactoryBase(grok.GlobalUtility):
    """A factory for things.

    This is a baseclass for easier creation of factories. Factories
    are utilities that are registered under a certain name and return
    instances of certain classes when called.

    In :mod:`waeup.kofa` we use factories extensively for
    batching. While processing a batch some processors looks up a
    factory to create real-world instances that then get filled with
    data from imported CSV files.

    To get rid of reimplementing the same stuff over and over again,
    most notably the methods defined here, we offer this base class
    (which will *not* be registered as a factory itself).

    Real factories can then be created like this:

       >>> import grok
       >>> from waeup.kofa.utils.helpers import FactoryBase
       >>> class MyObject(object):
       ...   # Some class we want to get instances of.
       ...   pass
       >>> class MyObjectFactory(FactoryBase):
       ...   # This is the factory for MyObject instances
       ...   grok.name(u'waeup.kofa.factory.MyObject')
       ...   factory = MyObject

    That's it. It is essential to set the ``factory`` attribute, which
    will determine the class of which instances should be created when
    called. The given name must even be unique amongst all utilities
    registered during runtime. While you can pick any name you like
    you might want to prepend ``waeup.kofa.factory.`` to the name
    string to make sure it does not clash with names of other
    utilities one day.

    Before all this works we have to grok the baseclass once and our
    freshly defined factory. This executes all the component
    registration stuff we don't want to do ourselves. In daily use
    this is done automatically on startup of a :mod:`waeup.kofa`
    system.

       >>> grok.testing.grok('waeup.kofa.utils.helpers')
       >>> grok.testing.grok_component(
       ...    'MyObjectFactory', MyObjectFactory
       ...  )
       True

    After grokking we (and processors) can create objects without
    knowing about the location of the real class definition, just by
    the factory name:

       >>> from zope.component import createObject
       >>> obj = createObject('waeup.kofa.factory.MyObject')
       >>> isinstance(obj, MyObject)
       True

    We can also use the regular utility lookups to find our new
    factory:

       >>> from zope.component import getUtility
       >>> from zope.component.interfaces import IFactory
       >>> factory = getUtility(
       ...   IFactory, name='waeup.kofa.factory.MyObject'
       ...   )
       >>> isinstance(factory, MyObjectFactory)
       True

    And this factory generates `MyObject` instances:

       >>> obj = factory()
       >>> isinstance(obj, MyObject)
       True

    """
    grok.baseclass()  # Do not grok this class, do not register us.
    grok.implements(IFactory)
    # You can override any of the following attributes in derived
    # classes. The `grok.name` setting *must* even be set to some
    # unique value.
    grok.name(u'waeup.Factory')
    # A plain unicode string. A trailing comma here would silently
    # turn the title into a 1-tuple.
    title = u"Create instances of ``factory``."
    description = u"This factory instantiates new applicant instances."
    # The class (or other callable) to create instances from. Derived
    # classes have to set this.
    factory = None

    def __call__(self, *args, **kw):
        """The main factory function.

        Returns an instance of the requested object.

        Positional and keyword arguments are accepted, but they are
        not passed on: ``factory`` is always called without arguments.
        """
        return self.factory()

    def getInterfaces(self):
        """Return the interfaces implemented by ``factory``.

        Required by IFactory.
        """
        return implementedBy(self.factory)
247
def ReST2HTML_w_warnings(source_string):
    """Convert a reStructuredText string to HTML preserving warnings.

    Returns a tuple ``(<HTML_CODE>, <WARNINGS>)``. ``<HTML_CODE>`` is
    the HTML code generated from the source string (in unicode),
    ``<WARNINGS>`` is a string containing any warning messages or
    ``None`` if there were no warnings.

    Regular multi-line ReStructuredText strings will be returned as
    HTML code:

        >>> from waeup.kofa.utils.helpers import ReST2HTML_w_warnings
        >>> source = '''
        ... Headline
        ... ========
        ...
        ... - A list item
        ... - Another item
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <ul class="simple">
        <li>A list item</li>
        <li>Another item</li>
        </ul>
        <p>Thanks for watching!</p>
        </div>

    Here no warnings happened, so the `warnings` are ``None``:

        >>> warnings is None
        True

    If warnings happen then they can be retrieved in the returned
    ``warnings``. We try to render an erroneous document:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

        >>> print warnings
        <string>:3: (WARNING/2) Title underline too short.
        <BLANKLINE>
        Headline
        ======
        <BLANKLINE>

    As you can see, the warnings are not displayed inline the document
    but can be retrieved from the returned warnings, which is a string
    or ``None``.
    """
    warning_stream = StringIO()
    # First pass: report everything into our own stream.
    html_doc = publish_string(
        source_string, writer_name='html4css1',
        settings_overrides=dict(
            report_level=0,
            warning_stream=warning_stream,
            ))
    collected = warning_stream.getvalue()
    if collected:
        # Render again, this time with no warnings inline...
        html_doc = publish_string(
            source_string, writer_name='html4css1',
            settings_overrides=dict(
                report_level=10000,
                halt_level=10000,
                warning_stream=warning_stream,
                ))
    else:
        collected = None
    inner = get_inner_HTML_part(html_doc).strip()
    if isinstance(inner, unicode):
        return inner, collected
    return inner.decode('utf-8'), collected
338
def ReST2HTML(source_string):
    """Render a string containing ReStructuredText to HTML.

    This is :func:`ReST2HTML_w_warnings` with the warnings part of the
    result thrown away. Use that function if you are interested in
    warnings about too short headings, etc.

    The returned string will be unicode.

    A regular document will be rendered like this:

        >>> source = '''
        ... Headline
        ... ========
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    A document with markup problems (here: the underline is too short)
    will look similar:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    """
    rendered = ReST2HTML_w_warnings(source_string)
    # `rendered` is a tuple (html, warnings); only the HTML is wanted.
    return rendered[0]
384
def attrs_to_fields(cls):
    """Turn the attributes of a class into FieldProperty instances.

    For each field name of the *first* interface implemented by `cls`
    a :class:`FieldProperty` for that field is set on the class.

    Returns the (modified) class, so that with Python >= 2.6 this
    function can also be used as a class decorator.
    """
    provided = list(implementedBy(cls))
    schema = provided[0]  # only the first interface is considered
    for name in getFieldNames(schema):
        field_property = FieldProperty(schema[name])
        setattr(cls, name, field_property)
    return cls
394
def get_current_principal():
    """Get the 'current' principal.

    The principal is taken from the first participation of the
    current interaction, so this function works without a request.
    Examining a request is the regular (and recommended) way to get a
    principal involved 'currently'; use this function only if you
    really have no access to the current request.

    Returns ``None`` when no principal is involved (for instance
    during tests), i.e. if there is no interaction or the interaction
    has no participations.
    """
    try:
        participations = getInteraction().participations
        return participations[0].principal
    except (NoInteraction, IndexError):
        # No interaction at all or no participations present
        return None
415
def cmp_files(file_descr1, file_descr2):
    """Compare the contents of two open files.

    Both file(-like) objects are rewound to their start and then read
    in chunks of ``BUFSIZE``.

    Returns ``True`` if both are equal, ``False`` otherwise.
    """
    for descr in (file_descr1, file_descr2):
        descr.seek(0)
    chunk1 = file_descr1.read(BUFSIZE)
    chunk2 = file_descr2.read(BUFSIZE)
    while chunk1 == chunk2:
        if not chunk1:
            # Both files are exhausted at the same time: equal.
            return True
        chunk1 = file_descr1.read(BUFSIZE)
        chunk2 = file_descr2.read(BUFSIZE)
    return False
430
def string_from_bytes(number):
    """Turn a number into some textual representation.

    `number` is a number of bytes (integer or float). Kilobytes are
    given as whole numbers (rounded down), megabytes and gigabytes
    with two decimal places.

      Examples:

        >>> string_from_bytes(1)
        u'1 byte(s)'

        >>> string_from_bytes(1025)
        u'1 KB'

        >>> string_from_bytes(1.5 * 1024*1024)
        u'1.50 MB'

        >>> string_from_bytes(1572864)
        u'1.50 MB'

        >>> string_from_bytes(673.286 * 1024**3)
        u'673.29 GB'

    """
    if number < 1024:
        return u'%s byte(s)' % (str(number),)
    if number < 1024**2:
        # Explicit floor division: same result for ints and floats.
        return u'%d KB' % (number // 1024,)
    if number < 1024**3:
        # Divide by a float, so that integer input is not truncated.
        return u'%.2f MB' % (number / float(1024**2),)
    return u'%.2f GB' % (number / float(1024**3),)
456
def file_size(file_like_obj):
    """Determine file size in most effective manner.

    Returns the number of bytes in a file. This function works for
    both, real files as well as file-like objects like cStringIO based
    'files'.

    Example:

      >>> from cStringIO import StringIO
      >>> file_size(StringIO('my file content'))
      15

    If the object provides a usable ``fileno()``, the size is
    requested from the operating system. Otherwise (no ``fileno``
    attribute or ``fileno()`` not supported, as with in-memory files
    from the `io` module) we seek to the end of the file and return
    the position found there.

    Please note that this function expects the file-like object passed
    in to be at first reading position (it does no seek(0)) and that
    when finished the file pointer might be at end of file.
    """
    try:
        descriptor = file_like_obj.fileno()
    except (AttributeError, IOError, OSError, ValueError):
        # No real file behind this object. `io.UnsupportedOperation`
        # is covered by the listed exception types as well.
        pass
    else:
        return os.fstat(descriptor).st_size
    file_like_obj.seek(0, 2)  # seek to last position in file
    return file_like_obj.tell()
478
def get_user_account(request):
    """Return local user account.

    The account is looked up by the id of the request's principal in
    the authenticator plugin registered under the name ``'users'``.
    """
    principal_id = request.principal.id
    users = getUtility(IAuthenticatorPlugin, name='users')
    return users.getAccount(principal_id)
486
def iface_names(iface, omit=(), exclude_attribs=True, exclude_methods=True):
    """Get all attribute names of an interface.

    Searches also base interfaces.

    Names of fields that are pure attributes
    (i.e. zope.interface.Attribute) or methods are excluded by
    default. Set `exclude_attribs` or `exclude_methods` to ``False``
    to include them.

    Names of typical fields derived from zope.schema are included.

    The `omit` parameter can give a list (or any other container) of
    names to exclude.

    Returns a list of strings without duplicates. The list is not
    sorted.
    """
    # Collect all interfaces (also bases), each one only once.
    pending = [iface]
    seen_ifaces = set()
    ifaces = []
    while pending:
        current = pending.pop()
        if current in seen_ifaces:
            continue
        seen_ifaces.add(current)
        ifaces.append(current)
        pending.extend(current.getBases())
    # Collect (filtered) names of collected interfaces
    result = []
    seen_names = set()
    for current in ifaces:
        for name, descr in current.namesAndDescriptions():
            if name in omit or name in seen_names:
                continue
            # Exact class check on purpose: schema fields are derived
            # from Attribute but must not be excluded here.
            if exclude_attribs and descr.__class__ is Attribute:
                continue
            if exclude_methods and isinstance(descr, Method):
                continue
            seen_names.add(name)
            result.append(name)
    return result
526
def get_sorted_preferred(tuples_iterable, preferred_list):
    """Get a list of tuples (<TITLE>,<TOKEN>) with values in
    `preferred_list` put in front.

    The rest of the tuples iterable is returned in original order. This
    is useful for putting default entries on top of (already sorted)
    lists of choice values, for instance when sorting countries and
    their code.

    Sample:

    We have a list of tuples with uppercase 'titles' and lowercase
    'tokens'. This list is already sorted but we want certain values
    of this list to show up before other values. For instance we want
    to see the 'C' entry to come first.

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['c'])
      (('C', 'c'), ('A', 'a'), ('B', 'b'))

    i.e. the entry with 'c' as second value moved to head of result.

    We can also require multiple entries at head of list:

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['b', 'c'])
      (('B', 'b'), ('C', 'c'), ('A', 'a'))

    We required the 'b' entry to come before the 'c' entry and then
    the rest of the input list. That's what we got.

    Preferred values that do not occur in the input are simply
    skipped:

      >>> get_sorted_preferred([('A','a'), ('B','b')], ['x', 'b'])
      (('B', 'b'), ('A', 'a'))

    The result is returned as a tuple of tuples to keep order of values.
    """
    preferred = list(preferred_list)
    # One slot per preferred value; slots of values not found stay
    # `None` and are dropped at the end.
    head = [None] * len(preferred)
    tail = []
    for title, code in tuples_iterable:
        if code in preferred:
            head[preferred.index(code)] = (title, code)
        else:
            tail.append((title, code))
    return tuple([entry for entry in head if entry is not None] + tail)
568
def now(tz=None):
    """Get current datetime in timezone of `tz`.

    The current UTC time is determined and handed over to
    :func:`to_timezone` together with `tz`.

    If `tz`, a `tzinfo` instance, is None, UTC time is returned.

    `tz` should be a timezone as defined in pytz.
    """
    utc_now = datetime.datetime.utcnow()
    return to_timezone(utc_now, tz=tz)
577
def to_timezone(dt, tz=None):
    """Shift datetime into timezone `tz`.

    A 'naive' datetime `dt` (one without `tzinfo`) is considered to
    be UTC.

    Without `tz` the datetime is shifted to UTC.

    Values that are not instances of datetime.datetime are returned
    unchanged.
    """
    if isinstance(dt, datetime.datetime):
        target = pytz.utc if tz is None else tz
        aware = pytz.utc.localize(dt) if dt.tzinfo is None else dt
        shifted = aware.tzinfo.normalize(aware).astimezone(target)
        return target.normalize(shifted)
    return dt
596
def get_fileformat(path, bytestream=None):
    """Try to determine the file format of a given media file.

    The filename extension plays no role here. Detection is left to
    the stdlib `imghdr` library, which inspects header data of the
    file or bytestream (Magic Words).

    If bytestream is not `None` the `path` is ignored.

    Returns filetype as string (something like ``'jpg'``) if
    file-format can be recognized, ``None`` else. The `imghdr` names
    ``'jpeg'`` and ``'tiff'`` are returned as ``'jpg'`` and ``'tif'``
    respectively.

    Tested recognized filetypes currently are `jpg`, `png`, and `pdf`.

    More filetypes (though untested in waeup.kofa) are automatically
    recognized by `imghdr`. See this module's docs for a complete list
    of filetypes recognized.
    """
    if path is None and bytestream is None:
        return None
    # With a `None` bytestream `imghdr.what` reads from `path`.
    detected = imghdr.what(path, bytestream)
    if detected == 'jpeg':
        return 'jpg'
    if detected == 'tiff':
        return 'tif'
    return detected
628
def check_pdf(bytestream, file):
    """Tell whether a file or bytestream is a PDF file.

    Works as a test/plugin for the stdlib `imghdr` library.

    If `file` is not ``None``, its first four bytes are examined (the
    file is rewound before and after reading) and `bytestream` is
    ignored. Otherwise the start of `bytestream` is examined.

    Returns ``'pdf'`` if the data starts with the PDF magic word,
    ``None`` otherwise (also if neither a file nor a bytestream is
    given).
    """
    if file is not None:
        file.seek(0)
        bytestream = file.read(4)
        file.seek(0)

    if bytestream is None:
        return None
    # Bytes literal: identical to a plain string on Python 2, and
    # comparable with binary data on Python 3 as well.
    if bytestream.startswith(b'%PDF'):
        return 'pdf'
    return None
642
# Register `check_pdf` as an additional header check function with
# `imghdr`, so that `imghdr.what` (as used by `get_fileformat`) can
# also report 'pdf'. The membership test avoids appending the same
# function object twice.
if check_pdf not in imghdr.tests:
    imghdr.tests.append(check_pdf)
646
def _open_csv(path_or_fd, mode):
    """Open a CSV file the way the local `csv` module requires it.

    `path_or_fd` is a path or an open file descriptor, `mode` is
    ``'r'`` or ``'w'``. Python 2 wants binary files for CSV data,
    Python 3 wants text files with newline translation turned off.
    """
    import sys
    if sys.version_info[0] < 3:
        if isinstance(path_or_fd, int):
            return os.fdopen(path_or_fd, mode + 'b')
        return open(path_or_fd, mode + 'b')
    return open(path_or_fd, mode, newline='')


def _get_csv_fieldnames(path):
    """Get the column names (header row) of the CSV file in `path`.

    Returns an empty list for empty files.
    """
    with _open_csv(path, 'r') as fd:
        return list(csv.DictReader(fd).fieldnames or [])


def merge_csv_files(path1, path2):
    """Merge two CSV files into one (appending).

    CSV data from `path2` will be merged into `path1` csv file. This
    is a bit like 'appending' data from path2 to data from path1.

    The path of the resulting temporary file will be returned.

    In the result file data from `path2` will always come _after_ data
    from `path1`.

    **Caution**: It is the _callers_ responsibility to remove the
    result file (which is created by tempfile.mkstemp) after usage.
    If merging fails, the temporary file is removed here and the
    error is raised again.

    This CSV file merging copes with different column orders in both
    CSV files and even with different column sets in both files. The
    columns of the result file are the sorted union of the header
    columns of both files; values missing in a row are left empty.

    Also empty CSV files and files consisting of a header row only
    can be handled.
    """
    # sniff the col names from the header rows
    fieldnames = sorted(
        set(_get_csv_fieldnames(path1) + _get_csv_fieldnames(path2)))
    # now read/write the real data
    wp, tmp_path = tempfile.mkstemp()
    try:
        with _open_csv(wp, 'w') as out:
            writer = csv.DictWriter(out, fieldnames)
            writer.writerow(dict((x, x) for x in fieldnames))  # header
            for path in (path1, path2):
                with _open_csv(path, 'r') as fd:
                    for row in csv.DictReader(fd):
                        writer.writerow(row)
    except Exception:
        # Do not leave a half-written temporary file behind.
        os.unlink(tmp_path)
        raise
    return tmp_path
Note: See TracBrowser for help on using the repository browser.