Context navigation

source: main/waeup.kofa/trunk/src/waeup/kofa/utils/helpers.py @ 10389

Last change on this file since 10389 was 10028, checked in by uli, 12 years ago
Further updates of CSV-related places.
Property svn:keywords set to `Id`
File size: 22.4 KB

Line
1	## $Id: helpers.py 10028 2013-03-15 01:12:42Z uli $
2	##
3	## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4	## This program is free software; you can redistribute it and/or modify
5	## it under the terms of the GNU General Public License as published by
6	## the Free Software Foundation; either version 2 of the License, or
7	## (at your option) any later version.
8	##
9	## This program is distributed in the hope that it will be useful,
10	## but WITHOUT ANY WARRANTY; without even the implied warranty of
11	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	## GNU General Public License for more details.
13	##
14	## You should have received a copy of the GNU General Public License
15	## along with this program; if not, write to the Free Software
16	## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17	##
18	"""General helper functions for Kofa.
19	"""
20	import unicodecsv as csv # XXX: csv ops should move to dedicated module.
21	import datetime
22	import imghdr
23	import logging
24	import os
25	import pytz
26	import re
27	import shutil
28	import tempfile
29	import grok
30	from cStringIO import StringIO
31	from docutils.core import publish_string
32	from zope.component import getUtility
33	from zope.component.interfaces import IFactory
34	from zope.interface import implementedBy
35	from zope.interface.interface import Method, Attribute
36	from zope.schema import getFieldNames
37	from zope.schema.fieldproperty import FieldProperty
38	from zope.security.interfaces import NoInteraction
39	from zope.security.management import getInteraction
40	from zope.pluggableauth.interfaces import IAuthenticatorPlugin
41	from waeup.kofa.interfaces import MessageFactory as _
42
# Number of bytes (8 KiB) requested per ``read()`` call when files are
# processed chunk-wise (see :func:`cmp_files`).
BUFSIZE = 8 * 1024
44
def remove_file_or_directory(filepath):
    """Delete `filepath`, be it a directory tree or a single file.

    Unlike :func:`shutil.rmtree` this helper does not complain about
    paths that do not exist (nothing happens then) and it also removes
    `filepath` if it turns out not to be a directory.

    The path is made absolute before anything is removed. Always
    returns ``None``.
    """
    target = os.path.abspath(filepath)
    if os.path.isdir(target):
        # Directories are removed with all their contents.
        shutil.rmtree(target)
    elif os.path.exists(target):
        os.unlink(target)
    return
60
def copy_filesystem_tree(src, dst, overwrite=False, del_old=False):
    """Copy the entries of directory `src` into directory `dst`.

    Both paths must exist and must be directories, otherwise a
    :exc:`ValueError` is raised.

    Entries whose name starts with a dot (Unix hidden files and
    directories) are skipped silently.

    If an entry of the same name already exists in `dst`, it is
    removed and replaced only if `overwrite` is ``True`` (checked by
    identity, other truthy values do not count). Otherwise the
    existing entry is not touched and the name is reported in the
    returned list.

    If `del_old` is true, every copied entry is removed from `src`
    afterwards.

    Returns a list with the names of all entries that were not copied
    because they already existed in `dst`.
    """
    if not os.path.exists(src):
        raise ValueError('source path does not exist: %s' % src)
    if not os.path.exists(dst):
        raise ValueError('destination path does not exist: %s' % dst)
    if not os.path.isdir(src):
        raise ValueError('source path is not a directory: %s' % src)
    if not os.path.isdir(dst):
        raise ValueError('destination path is not a directory: %s' % dst)

    skipped = []
    # We do not copy hidden stuff...
    visible = [name for name in os.listdir(src) if not name.startswith('.')]
    for name in visible:
        origin = os.path.join(src, name)
        target = os.path.join(dst, name)

        if os.path.exists(target):
            if overwrite is not True:
                skipped.append(name)
                continue
            # Make room for the new copy.
            remove_file_or_directory(target)

        copy = shutil.copytree if os.path.isdir(origin) else shutil.copy2
        copy(origin, target)
        if del_old:
            remove_file_or_directory(origin)
    return skipped
106
107
def get_inner_HTML_part(html_code):
    """Return the 'inner' part of a complete HTML snippet.

    If there is a form part, get this (including the ``<form>`` tags).

    If there is no form part, try to return the body part contents
    (without the ``<body>`` tags).

    If there is no body, return as-is.

    Let's see how that works. If we deliver some doc with form, we
    will get that form only:

      >>> doc = '<html><form>My Form</form>Outside the form</html>'
      >>> get_inner_HTML_part(doc)
      '<form>My Form</form>'

    No form? Then seek for a body part and get the contents:

      >>> doc = '<html><body>My Body</body>Trailing Trash</html>'
      >>> get_inner_HTML_part(doc)
      'My Body'

    If none of these is included, return what we got:

      >>> doc = '<html>without body nor form</html>'
      >>> get_inner_HTML_part(doc)
      '<html>without body nor form</html>'

    Please note that a form is only found if it is preceded and
    followed by at least one other character, and a body only if it
    is preceded by at least one other character.
    """
    # The opening tags may carry attributes (``[^>]*``) and the
    # enclosed content may be empty or span several lines (``.*`` in
    # DOTALL mode).
    form = re.match(
        r'^.+(<form[^>]*>.*</form>).+$', html_code, re.DOTALL)
    if form is not None:
        return form.group(1)
    # No <form> part included
    body = re.match(
        r'^.+<body[^>]*>(.*)</body>.*$', html_code, re.DOTALL)
    if body is not None:
        return body.group(1)
    # No <form> and no <body> tag...
    return html_code
152
class FactoryBase(grok.GlobalUtility):
    """A factory for things.

    This is a baseclass for easier creation of factories. Factories
    are utilities that are registered under a certain name and return
    instances of certain classes when called.

    In :mod:`waeup.kofa` we use factories extensively for
    batching. While processing a batch some processors look up a
    factory to create real-world instances that then get filled with
    data from imported CSV files.

    To get rid of reimplementing the same stuff over and over again,
    most notably the methods defined here, we offer this base class
    (which will not be registered as a factory itself).

    Real factories can then be created like this:

      >>> import grok
      >>> from waeup.kofa.utils.helpers import FactoryBase
      >>> class MyObject(object):
      ...   # Some class we want to get instances of.
      ...   pass
      >>> class MyObjectFactory(FactoryBase):
      ...   # This is the factory for MyObject instances
      ...   grok.name(u'waeup.kofa.factory.MyObject')
      ...   factory = MyObject

    That's it. It is essential to set the ``factory`` attribute, which
    will determine the class of which instances should be created when
    called. The given name must even be unique amongst all utilities
    registered during runtime. While you can pick any name you like
    you might want to prepend ``waeup.kofa.factory.`` to the name
    string to make sure it does not clash with names of other
    utilities one day.

    Before all this works we have to grok the baseclass once and our
    freshly defined factory. This executes all the component
    registration stuff we don't want to do ourselves. In daily use
    this is done automatically on startup of a :mod:`waeup.kofa`
    system.

      >>> grok.testing.grok('waeup.kofa.utils.helpers')
      >>> grok.testing.grok_component(
      ...    'MyObjectFactory', MyObjectFactory
      ...  )
      True

    After grokking we (and processors) can create objects without
    knowing about the location of the real class definition, just by
    the factory name:

      >>> from zope.component import createObject
      >>> obj = createObject('waeup.kofa.factory.MyObject')
      >>> isinstance(obj, MyObject)
      True

    We can also use the regular utility lookups to find our new
    factory:

      >>> from zope.component import getUtility
      >>> from zope.component.interfaces import IFactory
      >>> factory = getUtility(
      ...   IFactory, name='waeup.kofa.factory.MyObject'
      ...   )
      >>> isinstance(factory, MyObjectFactory)
      True

    And this factory generates `MyObject` instances:

      >>> obj = factory()
      >>> isinstance(obj, MyObject)
      True

    """
    grok.baseclass() # Do not grok this class, do not register us.
    grok.implements(IFactory)
    # You can override any of the following attributes in derived
    # classes. The `grok.name` setting must even be set to some
    # unique value.
    grok.name(u'waeup.Factory')
    # A plain unicode string; no trailing comma here, which would
    # turn the title into a tuple.
    title = u"Create instances of ``factory``."
    description = u"This factory instantiates new applicant instances."
    # The class to instantiate; must be set by derived classes.
    factory = None

    def __call__(self, *args, **kw):
        """The main factory function.

        Returns an instance of the requested object.

        Any positional or keyword arguments are accepted, so that the
        factory can be called with or without arguments, but they are
        not passed on: ``factory`` is always called without arguments.
        """
        return self.factory()

    def getInterfaces(self):
        """Return the interfaces implemented by ``factory``.

        Required by IFactory.
        """
        return implementedBy(self.factory)
248
def ReST2HTML_w_warnings(source_string):
    """Render reStructuredText as HTML and hand out warnings separately.

    Returns a tuple ``(<HTML_CODE>, <WARNINGS>)``. ``<HTML_CODE>`` is
    the HTML code generated from the source string (in unicode),
    ``<WARNINGS>`` is a string containing any warning messages or
    ``None`` if there were none.

    Regular multi-line ReStructuredText strings will be returned as
    HTML code:

      >>> from waeup.kofa.utils.helpers import ReST2HTML_w_warnings
      >>> source = '''
      ... Headline
      ... ========
      ...
      ... - A list item
      ... - Another item
      ...
      ... Thanks for watching!
      ... '''
      >>> html, warnings = ReST2HTML_w_warnings(source)
      >>> print html
      <div class="document" id="headline">
      <h1 class="title">Headline</h1>
      <BLANKLINE>
      <ul class="simple">
      <li>A list item</li>
      <li>Another item</li>
      </ul>
      <p>Thanks for watching!</p>
      </div>

    Here no warnings happened, so the `warnings` are ``None``:

      >>> warnings is None
      True

    If warnings happen then they can be retrieved in the returned
    ``warnings``. We try to render an erroneous document:

      >>> source = '''
      ... Headline
      ... ======
      ...
      ... Thanks for watching!
      ... '''
      >>> html, warnings = ReST2HTML_w_warnings(source)
      >>> print html
      <div class="document" id="headline">
      <h1 class="title">Headline</h1>
      <BLANKLINE>
      <p>Thanks for watching!</p>
      </div>

      >>> print warnings
      <string>:3: (WARNING/2) Title underline too short.
      <BLANKLINE>
      Headline
      ======
      <BLANKLINE>

    As you can see, the warnings are not displayed inline the document
    but can be retrieved from the returned warnings, which is a string
    or ``None``.
    """
    stream = StringIO()
    # First pass: report everything and collect it in our own stream.
    verbose_settings = {
        'report_level': 0,
        'warning_stream': stream,
        }
    fulldoc = publish_string(
        source_string, writer_name='html4css1',
        settings_overrides=verbose_settings)
    warning_msgs = stream.getvalue()
    if warning_msgs:
        # Render again, this time with no warnings inline...
        quiet_settings = {
            'report_level': 10000,
            'halt_level': 10000,
            'warning_stream': stream,
            }
        fulldoc = publish_string(
            source_string, writer_name='html4css1',
            settings_overrides=quiet_settings)
    else:
        # Nothing was reported at all.
        warning_msgs = None
    html = get_inner_HTML_part(fulldoc).strip()
    if isinstance(html, unicode):
        return html, warning_msgs
    return html.decode('utf-8'), warning_msgs
339
def ReST2HTML(source_string):
    """Turn a ReStructuredText string into HTML.

    This is :func:`ReST2HTML_w_warnings` with the warnings thrown
    away: messages about too short headings, etc. are silently
    discarded. Use :func:`ReST2HTML_w_warnings` if you are interested
    in them.

    The returned string will be unicode.

    A regular document will be rendered like this:

      >>> source = '''
      ... Headline
      ... ========
      ...
      ... Thanks for watching!
      ... '''
      >>> html = ReST2HTML(source)
      >>> print html
      <div class="document" id="headline">
      <h1 class="title">Headline</h1>
      <BLANKLINE>
      <p>Thanks for watching!</p>
      </div>

    A document with markup problems (here: the underline is too short)
    will look similar:

      >>> source = '''
      ... Headline
      ... ======
      ...
      ... Thanks for watching!
      ... '''
      >>> html = ReST2HTML(source)
      >>> print html
      <div class="document" id="headline">
      <h1 class="title">Headline</h1>
      <BLANKLINE>
      <p>Thanks for watching!</p>
      </div>

    """
    # Only the HTML part of the (html, warnings) tuple is of interest.
    return ReST2HTML_w_warnings(source_string)[0]
385
def attrs_to_fields(cls, omit=[]):
    """Install a FieldProperty on `cls` for each field of its interface.

    The interface examined is the first one yielded by
    ``implementedBy(cls)``. For every schema field name of that
    interface an attribute of the same name is set on `cls`, holding
    a :class:`FieldProperty` built from the field.

    `omit` is a list of field names that should _not_ be turned into
    field properties. This is useful for properties and the like.

    Returns `cls`, so with Python >= 2.6 this function can even be
    used as a class decorator.
    """
    iface = list(implementedBy(cls))[0]
    wanted = [name for name in getFieldNames(iface) if name not in omit]
    for name in wanted:
        setattr(cls, name, FieldProperty(iface[name]))
    return cls
400
def get_current_principal():
    """Return the principal of the current interaction, if any.

    This works without a request: the principal is taken from the
    first participation of the current security interaction.
    Examining a request is the regular (and recommended) way to get a
    principal involved 'currently', so use this function only if you
    really have no access to the current request.

    Returns ``None`` if there is no interaction or if the interaction
    has no participations (for instance during tests).
    """
    try:
        participations = getInteraction().participations
        return participations[0].principal
    except (NoInteraction, IndexError):
        # Either no interaction at all or no participations present.
        return None
421
def cmp_files(file_descr1, file_descr2):
    """Tell whether two open files have the same contents.

    Both file objects are rewound to their start and then read
    chunk-wise (``BUFSIZE`` bytes per read).

    Returns ``True`` if both are equal, ``False`` otherwise.
    """
    for stream in (file_descr1, file_descr2):
        stream.seek(0)
    while True:
        chunk1 = file_descr1.read(BUFSIZE)
        chunk2 = file_descr2.read(BUFSIZE)
        if chunk1 != chunk2:
            return False
        if not chunk1:
            # Both files are exhausted and no difference showed up.
            return True
436
def string_from_bytes(number):
    """Turn a number into some textual representation.

    `number` is a number of bytes, given as integer or float.

    Values below 1024 are shown as they are, values below 1024**2 as
    whole kilobytes (rounded down), everything else as megabytes or
    gigabytes with two decimal places.

    Examples:

      >>> string_from_bytes(1)
      u'1 byte(s)'

      >>> string_from_bytes(1025)
      u'1 KB'

      >>> string_from_bytes(1.5 * 1024*1024)
      u'1.50 MB'

      >>> string_from_bytes(673.286 * 1024**3)
      u'673.29 GB'

    Integer and float input give the same result:

      >>> string_from_bytes(1536 * 1024)
      u'1.50 MB'

    """
    if number < 1024:
        return u'%s byte(s)' % (str(number),)
    if number < 1024 ** 2:
        # Explicit floor division: whole kilobytes, regardless of the
        # type of `number` and of the meaning of ``/``.
        return u'%s KB' % (int(number // 1024),)
    # Float divisors keep the fractional part also for integer input.
    if number < 1024 ** 3:
        return u'%.2f MB' % (number / 1024.0 ** 2,)
    return u'%.2f GB' % (number / 1024.0 ** 3,)
462
def file_size(file_like_obj):
    """Return the number of bytes in a file or file-like object.

    Objects providing a ``fileno`` attribute are asked via
    :func:`os.fstat`. For all other objects (like cStringIO based
    'files') we seek to the end and return the position reached.

    Example:

      >>> from cStringIO import StringIO
      >>> file_size(StringIO('my file content'))
      15

    Please note that this function expects the file-like object passed
    in to be at first reading position (it does no seek(0)) and that
    when finished the file pointer might be at end of file.
    """
    if not hasattr(file_like_obj, 'fileno'):
        # No file descriptor to ask: go to the last position in file.
        file_like_obj.seek(0, os.SEEK_END)
        return file_like_obj.tell()
    return os.fstat(file_like_obj.fileno()).st_size
484
def get_user_account(request):
    """Return the local user account of the request's principal.

    The account is looked up by principal id in the authenticator
    plugin registered under the name ``users``.
    """
    principal_id = request.principal.id
    users = getUtility(IAuthenticatorPlugin, name='users')
    return users.getAccount(principal_id)
492
def iface_names(iface, omit=[], exclude_attribs=True, exclude_methods=True):
    """Collect the attribute names of an interface and all its bases.

    By default names of pure attributes (i.e. descriptions that are
    exactly of class zope.interface.Attribute) and names of methods
    are left out. Set `exclude_attribs` or `exclude_methods` to false
    to get them as well.

    Names of typical fields derived from zope.schema are included.

    The `omit` parameter can give a list of names to exclude.

    Returns an unsorted list of strings; each name appears only once.
    """
    # Collect all interfaces (also bases) by extending the set until
    # it does not grow any more.
    known = set((iface,))
    while True:
        snapshot = set(known)
        grown = snapshot
        for current in snapshot:
            grown = grown | set(current.getBases())
        if grown == known:
            # No new interfaces found, set complete
            break
        known = grown
    # Collect (filtered) names of collected interfaces
    names = []
    for current in known:
        for name, descr in current.namesAndDescriptions():
            skip = (
                name in omit
                or (exclude_attribs and descr.__class__ is Attribute)
                or (exclude_methods and isinstance(descr, Method))
                or name in names)
            if not skip:
                names.append(name)
    return names
532
def get_sorted_preferred(tuples_iterable, preferred_list):
    """Get a list of tuples (<TITLE>,<TOKEN>) with values in
    `preferred_list` put in front.

    The rest of the tuples iterable is returned in original order. This
    is useful for putting default entries on top of (already sorted)
    lists of choice values, for instance when sorting countries and
    their code.

    Sample:

    We have a list of tuples with uppercase 'titles' and lowercase
    'tokens'. This list is already sorted but we want certain values
    of this list to show up before other values. For instance we want
    to see the 'C' entry to come first.

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['c'])
      (('C', 'c'), ('A', 'a'), ('B', 'b'))

    i.e. the entry with 'c' as second value moved to head of result.

    We can also require multiple entries at head of list:

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['b', 'c'])
      (('B', 'b'), ('C', 'c'), ('A', 'a'))

    We required the 'b' entry to come before the 'c' entry and then
    the rest of the input list. That's what we got.

    Preferred values that do not show up in the input are ignored:

      >>> get_sorted_preferred([('A','a'), ('B','b')], ['z', 'b'])
      (('B', 'b'), ('A', 'a'))

    If a preferred token appears several times in the input, the last
    entry with that token is the one placed at the head.

    The result is returned as a tuple of tuples to keep order of values.
    """
    preferred = list(preferred_list)
    # One slot per preferred value; a slot stays ``None`` if the value
    # is not found in the input.
    head = [None] * len(preferred)
    tail = []
    for title, code in tuples_iterable:
        try:
            position = preferred.index(code)
        except ValueError:
            tail.append((title, code))
        else:
            head[position] = (title, code)
    # Drop slots of preferred values that never appeared, so that the
    # result contains real (title, token) tuples only.
    found = [entry for entry in head if entry is not None]
    return tuple(found + tail)
574
def now(tz=None):
    """Return the current datetime, expressed in timezone `tz`.

    `tz` should be a `tzinfo` instance as defined in pytz. If it is
    ``None``, UTC time is returned.
    """
    utc_now = datetime.datetime.utcnow()
    return to_timezone(utc_now, tz=tz)
583
def to_timezone(dt, tz=None):
    """Express datetime `dt` in timezone `tz`.

    A 'naive' `dt` (one without `tzinfo`) is assumed to be UTC.

    If no `tz` is given, the result is in UTC.

    Values that are not :class:`datetime.datetime` instances are
    returned unchanged.
    """
    if not isinstance(dt, datetime.datetime):
        return dt
    target = pytz.utc if tz is None else tz
    aware = dt if dt.tzinfo is not None else pytz.utc.localize(dt)
    # Normalize in the source timezone first, then in the target one.
    shifted = aware.tzinfo.normalize(aware).astimezone(target)
    return target.normalize(shifted)
602
def get_fileformat(path, bytestream=None):
    """Try to find out the file format of a media file.

    The filename extension is not considered at all. Instead the
    header data is checked against common known rules (Magic Words)
    by the stdlib `imghdr` library. These checks are not done very
    thoroughly.

    If bytestream is not `None` the `path` is ignored.

    Returns filetype as string (something like ``'jpg'``) if
    file-format can be recognized, ``None`` else. This includes the
    case that both, `path` and `bytestream`, are ``None``. The
    `imghdr` results ``'jpeg'`` and ``'tiff'`` are returned as
    ``'jpg'`` and ``'tif'`` respectively.

    Tested recognized filetypes currently are `jpg`, `png`, and `pdf`.

    More filetypes (though untested in waeup.kofa) are automatically
    recognized because we deploy the stdlib `imghdr` library. See this
    module's docs for a complete list of filetypes recognized.
    """
    if path is None and bytestream is None:
        return None

    if bytestream is None:
        detected = imghdr.what(path)
    else:
        detected = imghdr.what(path, bytestream)
    # Use the short names common as filename extensions.
    if detected == 'jpeg':
        return 'jpg'
    if detected == 'tiff':
        return 'tif'
    return detected
634
def check_pdf(bytestream, file):
    """Return ``'pdf'`` for PDF data, ``None`` for anything else.

    The data is taken for PDF if it starts with ``%PDF``.

    If `file` is not ``None``, the first four bytes are read from it
    and `bytestream` is ignored. The file is rewound to its start
    before and after reading.

    Works as a test/plugin for the stdlib `imghdr` library.
    """
    header = bytestream
    if file is not None:
        file.seek(0)
        header = file.read(4)
        file.seek(0)
    return 'pdf' if header.startswith('%PDF') else None
648
# Register check_pdf as header check function with `imghdr`: append
# it to the list of test functions unless it is already in that list.
if check_pdf not in imghdr.tests:
    imghdr.tests.append(check_pdf)
652
def _csv_first_row_keys(path):
    """Return the keys of the first data row of the CSV file `path`.

    Returns an empty list if the file contains no data row.
    """
    with open(path, 'rb') as csv_file:
        try:
            return list(next(csv.DictReader(csv_file)).keys())
        except StopIteration:
            return []

def merge_csv_files(path1, path2):
    """Merge two CSV files into one (appending).

    CSV data from `path2` will be merged into `path1` csv file. This
    is a bit like 'appending' data from path2 to data from path1.

    The path of the resulting temporary file will be returned.

    In the result file data from `path2` will always come _after_ data
    from `path1`. The columns of the result file are the sorted union
    of the columns found in the first data row of each input file.

    Caution: It is the _callers_ responsibility to remove the
    result file (which is created by tempfile.mkstemp) after usage.
    If merging fails, the temporary file is removed here and the
    exception is raised again.

    This CSV file merging copes with different column orders in both
    CSV files and even with different column sets in both files.

    Also broken/empty CSV files can be handled.
    """
    # sniff the col names
    # NOTE(review): a file with a header line but no data row
    # contributes no column names this way.
    fieldnames = sorted(
        set(_csv_first_row_keys(path1)) | set(_csv_first_row_keys(path2)))
    # now read/write the real data
    wp, tmp_path = tempfile.mkstemp()
    try:
        # The ``with`` blocks make sure all data is flushed and every
        # file is closed before the result path is handed out.
        with os.fdopen(wp, 'wb') as result_file:
            writer = csv.DictWriter(result_file, fieldnames)
            writer.writerow(dict((x, x) for x in fieldnames)) # header
            for path in (path1, path2):
                with open(path, 'rb') as csv_file:
                    for row in csv.DictReader(csv_file):
                        writer.writerow(row)
    except Exception:
        # The caller never learns about the path, so clean up here.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise
    return tmp_path
693
def product(sequence, start=1):
    """Multiply `start` (default: 1) with all items of `sequence`.

    The items are expected to be numbers (_not_ strings). `sequence`
    must support ``len()``.

    For an empty sequence 0 is returned, regardless of `start`.
    """
    if len(sequence) == 0:
        return 0
    total = start
    for factor in sequence:
        total *= factor
    return total
705
class NullHandler(logging.Handler):
    """A logging handler that swallows all records.

    Does not log anything. Useful if you want to shut up a log.

    Defined here for backwards compatibility with Python < 2.7.
    """
    def emit(self, record):
        """Ignore `record`."""

Note: See TracBrowser for help on using the repository browser.

Download in other formats: