Context navigation

source: main/waeup.kofa/trunk/src/waeup/kofa/utils/helpers.py @ 8685

Last change on this file since 8685 was 8633, checked in by uli, 13 years ago
Provide the more robust CSV file merging.
Property svn:keywords set to `Id`
File size: 21.5 KB

Line
1	## $Id: helpers.py 8633 2012-06-06 01:11:19Z uli $
2	##
3	## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4	## This program is free software; you can redistribute it and/or modify
5	## it under the terms of the GNU General Public License as published by
6	## the Free Software Foundation; either version 2 of the License, or
7	## (at your option) any later version.
8	##
9	## This program is distributed in the hope that it will be useful,
10	## but WITHOUT ANY WARRANTY; without even the implied warranty of
11	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	## GNU General Public License for more details.
13	##
14	## You should have received a copy of the GNU General Public License
15	## along with this program; if not, write to the Free Software
16	## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17	##
18	"""General helper functions for Kofa.
19	"""
20	import csv
21	import datetime
22	import imghdr
23	import os
24	import pytz
25	import re
26	import shutil
27	import tempfile
28	import grok
29	from cStringIO import StringIO
30	from docutils.core import publish_string
31	from zope.component import getUtility
32	from zope.component.interfaces import IFactory
33	from zope.interface import implementedBy
34	from zope.interface.interface import Method, Attribute
35	from zope.schema import getFieldNames
36	from zope.schema.fieldproperty import FieldProperty
37	from zope.security.interfaces import NoInteraction
38	from zope.security.management import getInteraction
39	from zope.pluggableauth.interfaces import IAuthenticatorPlugin
40
# Chunk size (in bytes) used when reading files blockwise, see `cmp_files`.
BUFSIZE = 8 * 1024
42
def remove_file_or_directory(filepath):
    """Delete `filepath`, be it a regular file or a directory tree.

    In contrast to :func:`shutil.rmtree` paths that do not exist are
    accepted (nothing happens then) and a path that turns out to be a
    regular file is removed as well.
    """
    target = os.path.abspath(filepath)
    if os.path.exists(target):
        # Directories need a recursive removal, everything else is unlinked.
        remover = shutil.rmtree if os.path.isdir(target) else os.unlink
        remover(target)
    return
58
def copy_filesystem_tree(src, dst, overwrite=False, del_old=False):
    """Copy contents of directory `src` to directory `dst`.

    Both directories must exist, otherwise a :exc:`ValueError` is
    raised.

    If `overwrite` is true, any same named objects in `dst` will be
    overwritten. Otherwise these objects will not be touched.

    If `del_old` is true, copied files and directories will be removed
    from the `src` directory.

    This function returns a list of names of non-copied items.

    Unix hidden files and directories (starting with '.') are not
    processed by this function.
    """
    # Check existence of both paths first, then their type. This keeps
    # the order in which problems are reported.
    checks = (
        (os.path.exists, 'does not exist'),
        (os.path.isdir, 'is not a directory'),
        )
    for check, problem in checks:
        for label, path in (('source', src), ('destination', dst)):
            if not check(path):
                raise ValueError('%s path %s: %s' % (label, problem, path))

    not_copied = []
    for item in os.listdir(src):
        if item.startswith('.'):
            continue  # We do not copy hidden stuff...
        itemsrc = os.path.join(src, item)
        itemdst = os.path.join(dst, item)

        if os.path.exists(itemdst):
            # Any true value requests overwriting (as documented), not
            # only the ``True`` singleton.
            if not overwrite:
                not_copied.append(item)
                continue
            remove_file_or_directory(itemdst)

        if os.path.isdir(itemsrc):
            shutil.copytree(itemsrc, itemdst)
        else:
            shutil.copy2(itemsrc, itemdst)
        if del_old:
            remove_file_or_directory(itemsrc)
    return not_copied
104
105
def get_inner_HTML_part(html_code):
    """Return the 'inner' part of a complete HTML snippet.

    If there is a form part, get this.

    If there is no form part, try to return the body part contents.

    If there is no body, return as-is.

    Let's see how that works. If we deliver some doc with form, we
    will get that form only:

       >>> doc = '<html><form>My Form</form>Outside the form</html>'
       >>> get_inner_HTML_part(doc)
       '<form>My Form</form>'

    No form? Then seek for a body part and get the contents:

       >>> doc = '<html><body>My Body</body>Trailing Trash</html>'
       >>> get_inner_HTML_part(doc)
       'My Body'

    If none of these is included, return what we got:

       >>> doc = '<html>without body nor form</html>'
       >>> get_inner_HTML_part(doc)
       '<html>without body nor form</html>'

    """
    # The opening tags may carry attributes (``[^>]*``) and the
    # enclosed content may span several lines (``re.DOTALL``).
    form_match = re.match(
        r'^.+(<form[^>]*>.*</form>).+$', html_code, re.DOTALL)
    if form_match is not None:
        return form_match.group(1)

    # No <form> part included, look for the contents of <body>.
    body_match = re.match(
        r'^.+<body[^>]*>(.*)</body>.*$', html_code, re.DOTALL)
    if body_match is not None:
        return body_match.group(1)

    # No <form> and no <body> tag...
    return html_code
150
class FactoryBase(grok.GlobalUtility):
    """A factory for things.

    This is a baseclass for easier creation of factories. Factories
    are utilities that are registered under a certain name and return
    instances of certain classes when called.

    In :mod:`waeup.kofa` we use factories extensively for
    batching. While processing a batch some processors look up a
    factory to create real-world instances that then get filled with
    data from imported CSV files.

    To get rid of reimplementing the same stuff over and over again,
    most notably the methods defined here, we offer this base class
    (which will not be registered as a factory itself).

    Real factories can then be created like this:

       >>> import grok
       >>> from waeup.kofa.utils.helpers import FactoryBase
       >>> class MyObject(object):
       ...   # Some class we want to get instances of.
       ...   pass
       >>> class MyObjectFactory(FactoryBase):
       ...   # This is the factory for MyObject instances
       ...   grok.name(u'waeup.kofa.factory.MyObject')
       ...   factory = MyObject

    That's it. It is essential to set the ``factory`` attribute, which
    will determine the class of which instances should be created when
    called. The given name must even be unique amongst all utilities
    registered during runtime. While you can pick any name you like
    you might want to prepend ``waeup.kofa.factory.`` to the name
    string to make sure it does not clash with names of other
    utilities one day.

    Before all this works we have to grok the baseclass once and our
    freshly defined factory. This executes all the component
    registration stuff we don't want to do ourselves. In daily use
    this is done automatically on startup of a :mod:`waeup.kofa`
    system.

       >>> grok.testing.grok('waeup.kofa.utils.helpers')
       >>> grok.testing.grok_component(
       ...    'MyObjectFactory', MyObjectFactory
       ...  )
       True

    After grokking we (and processors) can create objects without
    knowing about the location of the real class definition, just by
    the factory name:

       >>> from zope.component import createObject
       >>> obj = createObject('waeup.kofa.factory.MyObject')
       >>> isinstance(obj, MyObject)
       True

    We can also use the regular utility lookups to find our new
    factory:

       >>> from zope.component import getUtility
       >>> from zope.component.interfaces import IFactory
       >>> factory = getUtility(
       ...   IFactory, name='waeup.kofa.factory.MyObject'
       ...   )
       >>> isinstance(factory, MyObjectFactory)
       True

    And this factory generates `MyObject` instances:

       >>> obj = factory()
       >>> isinstance(obj, MyObject)
       True

    """
    grok.baseclass()  # Do not grok this class, do not register us.
    grok.implements(IFactory)
    # You can override any of the following attributes in derived
    # classes. The `grok.name` setting must even be set to some
    # unique value.
    grok.name(u'waeup.Factory')
    # A plain text value; a trailing comma here would silently turn
    # the title into a 1-tuple.
    title = u"Create instances of ``factory``."
    description = u"This factory instantiates new applicant instances."
    factory = None

    def __call__(self, *args, **kw):
        """The main factory function.

        Returns an instance of the requested object.

        Any positional or keyword arguments are accepted but not
        passed on: ``factory`` is always called without arguments.
        """
        return self.factory()

    def getInterfaces(self):
        """Return the interfaces implemented by ``factory``.

        Required by IFactory.
        """
        return implementedBy(self.factory)
246
def ReST2HTML_w_warnings(source_string):
    """Render a reStructuredText string as HTML and collect warnings.

    Returns a tuple ``(<HTML_CODE>, <WARNINGS>)``. ``<HTML_CODE>`` is
    the HTML code generated from the source string (in unicode),
    ``<WARNINGS>`` is a string containing any warning messages or
    ``None``.

    Regular multi-line ReStructuredText strings will be returned as
    HTML code:

        >>> from waeup.kofa.utils.helpers import ReST2HTML_w_warnings
        >>> source = '''
        ... Headline
        ... ========
        ...
        ... - A list item
        ... - Another item
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <ul class="simple">
        <li>A list item</li>
        <li>Another item</li>
        </ul>
        <p>Thanks for watching!</p>
        </div>

    Here no warnings happened, so the `warnings` are ``None``:

        >>> warnings is None
        True

    If warnings happen then they can be retrieved in the returned
    ``warnings``. We try to render an erroneous document:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html, warnings = ReST2HTML_w_warnings(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

        >>> print warnings
        <string>:3: (WARNING/2) Title underline too short.
        <BLANKLINE>
        Headline
        ======
        <BLANKLINE>

    As you can see, the warnings are not displayed inline in the
    document but can be retrieved from the returned warnings, which
    is a string or ``None``.
    """
    def render(stream, **levels):
        # Run docutils with the given report/halt levels, sending all
        # system messages to `stream`.
        overrides = dict(levels)
        overrides['warning_stream'] = stream
        return publish_string(
            source_string, writer_name='html4css1',
            settings_overrides=overrides)

    warning_stream = StringIO()
    fulldoc = render(warning_stream, report_level=0)
    # An empty message buffer is reported as ``None``.
    warning_msgs = warning_stream.getvalue() or None
    if warning_msgs is not None:
        # Render again, this time with no warnings inline...
        fulldoc = render(
            warning_stream, report_level=10000, halt_level=10000)
    html = get_inner_HTML_part(fulldoc).strip()
    if isinstance(html, unicode):
        return html, warning_msgs
    return html.decode('utf-8'), warning_msgs
337
def ReST2HTML(source_string):
    """Turn a ReStructuredText string into HTML.

    Warnings (about too short headings, etc.) are dropped silently.
    Use :func:`ReST2HTML_w_warnings` if you want to get any warnings.

    The returned string will be unicode.

    A regular document will be rendered like this:

        >>> source = '''
        ... Headline
        ... ========
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    A document with markup problems (here: the underline is too short)
    will look similar:

        >>> source = '''
        ... Headline
        ... ======
        ...
        ... Thanks for watching!
        ... '''
        >>> html = ReST2HTML(source)
        >>> print html
        <div class="document" id="headline">
        <h1 class="title">Headline</h1>
        <BLANKLINE>
        <p>Thanks for watching!</p>
        </div>

    """
    # Only the HTML part of the (html, warnings) pair is of interest.
    return ReST2HTML_w_warnings(source_string)[0]
383
def attrs_to_fields(cls):
    """Replace schema attributes of `cls` by FieldProperty instances.

    The field names are taken from the first interface listed by
    ``implementedBy(cls)``. The modified class is returned, so with
    Python >= 2.6 this function can also serve as a class decorator.
    """
    primary_iface = list(implementedBy(cls))[0]
    names = getFieldNames(primary_iface)
    for name in names:
        prop = FieldProperty(primary_iface[name])
        setattr(cls, name, prop)
    return cls
393
def get_current_principal():
    """Return the principal of the current interaction, if any.

    No request is needed for this lookup. Nevertheless examining a
    request is the regular (and recommended) way to find out which
    principal is involved 'currently', so use this function only if
    you really have no access to the current request.

    Returns ``None`` when no principal is involved (for instance
    during tests).
    """
    try:
        participations = getInteraction().participations
        return participations[0].principal
    except (NoInteraction, IndexError):
        # Either there is no interaction at all or it has no
        # participations.
        return None
414
def cmp_files(file_descr1, file_descr2):
    """Tell whether two open files have equal contents.

    Both file objects are rewound and then read in chunks of
    `BUFSIZE`.

    Returns ``True`` if both are equal, ``False`` otherwise.
    """
    for descr in (file_descr1, file_descr2):
        descr.seek(0)
    while True:
        chunk1 = file_descr1.read(BUFSIZE)
        chunk2 = file_descr2.read(BUFSIZE)
        if chunk1 == chunk2:
            if chunk1:
                continue
            # Both files are exhausted and never differed.
            return True
        return False
429
def string_from_bytes(number):
    """Turn a number of bytes into some textual representation.

    Examples:

        >>> string_from_bytes(1)
        u'1 byte(s)'

        >>> string_from_bytes(1025)
        u'1 KB'

        >>> string_from_bytes(1.5 * 1024*1024)
        u'1.50 MB'

        >>> string_from_bytes(673.286 * 1024**3)
        u'673.29 GB'

    Integers and floats are treated alike:

        >>> string_from_bytes(1536 * 1024)
        u'1.50 MB'

    """
    if number < 1024:
        return u'%s byte(s)' % (str(number),)
    if number < 1024 ** 2:
        # Kilobytes are shown as whole numbers (rounded down).
        return u'%d KB' % (number // 1024,)
    # Divide by floats: integer input must not lose its fractional part.
    if number < 1024 ** 3:
        return u'%.2f MB' % (number / 1024.0 ** 2,)
    return u'%.2f GB' % (number / 1024.0 ** 3,)
455
def file_size(file_like_obj):
    """Determine file size in most effective manner.

    Returns the number of bytes in a file. This function works for
    both, real files as well as file-like objects like cStringIO based
    'files'.

    Example:

      >>> from cStringIO import StringIO
      >>> file_size(StringIO('my file content'))
      15

    Objects backed by a file descriptor are asked via ``os.fstat``.
    All other objects (including those that provide a ``fileno``
    method which refuses to work) are measured by seeking to their
    end.

    Please note that this function expects the file-like object passed
    in to be at first reading position (it does no seek(0)) and that
    when finished the file pointer might be at end of file.
    """
    try:
        descriptor = file_like_obj.fileno()
    except (AttributeError, IOError, ValueError):
        # No `fileno` at all or an in-memory stream that only
        # pretends to have one.
        descriptor = None
    if descriptor is not None:
        return os.fstat(descriptor).st_size
    file_like_obj.seek(0, 2)  # seek to last position in file
    return file_like_obj.tell()
477
def get_user_account(request):
    """Return the local user account of the request's principal.

    The account is looked up in the authenticator plugin registered
    under the name ``users``.
    """
    users = getUtility(IAuthenticatorPlugin, name='users')
    return users.getAccount(request.principal.id)
485
def iface_names(iface, omit=(), exclude_attribs=True, exclude_methods=True):
    """Get all attribute names of an interface.

    Searches also base interfaces.

    Names of fields that are pure attributes
    (i.e. zope.interface.Attribute) or methods are excluded by
    default.

    Names of typical fields derived from zope.schema are included.

    The `omit` parameter can give a list (or any other container) of
    names to exclude.

    Returns an unsorted list of strings. A name defined by several of
    the collected interfaces is listed once per definition.
    """
    # Collect all interfaces (also bases) transitively
    collected = set((iface,))
    pending = [iface]
    while pending:
        for base in pending.pop().getBases():
            if base not in collected:
                collected.add(base)
                pending.append(base)
    # Collect (filtered) names of collected interfaces
    result = []
    for current in collected:
        for name, descr in current.namesAndDescriptions():
            if name in omit:
                continue
            # Exact class check on purpose: only plain attributes are
            # skipped here, not subclasses of `Attribute`.
            if exclude_attribs and descr.__class__ is Attribute:
                continue
            if exclude_methods and isinstance(descr, Method):
                continue
            result.append(name)
    return result
523
def get_sorted_preferred(tuples_iterable, preferred_list):
    """Get a list of tuples (<TITLE>,<TOKEN>) with values in
    `preferred_list` put in front.

    The rest of the tuples iterable is returned in original order. This
    is useful for putting default entries on top of (already sorted)
    lists of choice values, for instance when sorting countries and
    their code.

    Sample:

    We have a list of tuples with uppercase 'titles' and lowercase
    'tokens'. This list is already sorted but we want certain values
    of this list to show up before other values. For instance we want
    to see the 'C' entry to come first.

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['c'])
      (('C', 'c'), ('A', 'a'), ('B', 'b'))

    i.e. the entry with 'c' as second value moved to head of result.

    We can also require multiple entries at head of list:

      >>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
      ...                       ['b', 'c'])
      (('B', 'b'), ('C', 'c'), ('A', 'a'))

    We required the 'b' entry to come before the 'c' entry and then
    the rest of the input list. That's what we got.

    Preferred values that do not occur in the input are skipped:

      >>> get_sorted_preferred([('A','a'), ('B','b')], ['x', 'b'])
      (('B', 'b'), ('A', 'a'))

    The result is returned as a tuple of tuples to keep order of values.
    """
    preferred = list(preferred_list)
    # One slot per preferred value, to be filled when the value shows up.
    head = [None] * len(preferred)
    tail = []
    for title, code in tuples_iterable:
        if code in preferred:
            head[preferred.index(code)] = (title, code)
        else:
            tail.append((title, code))
    # Slots of preferred values not found in the input stay empty and
    # must not leak into the result.
    found = [entry for entry in head if entry is not None]
    return tuple(found + tail)
565
def now(tz=None):
    """Return the current datetime, expressed in timezone `tz`.

    With `tz` being ``None`` the result is given in UTC.

    `tz` should be a timezone as defined in pytz.
    """
    utc_now = datetime.datetime.utcnow()
    return to_timezone(utc_now, tz=tz)
574
def to_timezone(dt, tz=None):
    """Express datetime `dt` in timezone `tz`.

    A 'naive' `dt` (one without `tzinfo`) is taken as UTC.

    Without a `tz` the value is shifted to UTC.

    Values that are no `datetime.datetime` instances are handed back
    unchanged.
    """
    if isinstance(dt, datetime.datetime):
        target = pytz.utc if tz is None else tz
        aware = pytz.utc.localize(dt) if dt.tzinfo is None else dt
        # Both timezones are expected to provide `normalize` (as pytz
        # timezones do).
        normalized = aware.tzinfo.normalize(aware)
        return target.normalize(normalized.astimezone(target))
    return dt
593
def get_fileformat(path, bytestream=None):
    """Try to find out the file format of a given media file.

    The checks done here are not very thorough, but they do not
    rely on filename extensions or similar. Instead header data is
    checked against commonly known rules (Magic Words).

    If bytestream is not `None` the `path` is ignored.

    Returns the filetype as string (something like ``'jpg'``) if the
    file format can be recognized, ``None`` else.

    Tested recognized filetypes currently are `jpg`, `png`, and `pdf`.

    More filetypes (though untested in waeup.kofa) are recognized
    automatically because the stdlib `imghdr` library is used. See
    that module's docs for a complete list of filetypes recognized.
    """
    if path is None and bytestream is None:
        return None
    # A bytestream of ``None`` makes `imghdr` read the header from `path`.
    detected = imghdr.what(path, bytestream)
    # Use the short form of some type names.
    short_names = {'jpeg': 'jpg', 'tiff': 'tif'}
    return short_names.get(detected, detected)
625
def check_pdf(bytestream, file):
    """Tell whether a file or bytestream is a PDF file.

    Works as a test/plugin for the stdlib `imghdr` library.

    If `file` is not ``None`` its first bytes are examined (the file
    is rewound to its start afterwards), otherwise `bytestream` is.

    Returns ``'pdf'`` for data starting with the PDF magic word and
    ``None`` else (also when there is no data at all).
    """
    if file is not None:
        file.seek(0)
        bytestream = file.read(4)
        file.seek(0)

    if not bytestream:
        # Nothing to examine (empty data or neither file nor bytes).
        return None
    # Compare like with like: byte strings and text strings cannot be
    # mixed in `startswith` on every Python version.
    magic = b'%PDF' if isinstance(bytestream, bytes) else u'%PDF'
    if bytestream.startswith(magic):
        return 'pdf'
    return None
639
# Register `check_pdf` as an additional header check function with
# `imghdr`. The membership test avoids appending it a second time if it
# is already listed.
if check_pdf not in imghdr.tests:
    imghdr.tests.append(check_pdf)
643
def merge_csv_files(path1, path2):
    """Merge two CSV files into one (appending).

    CSV data from `path2` will be merged into `path1` csv file. This
    is a bit like 'appending' data from path2 to data from path1.

    The path of the resulting temporary file will be returned.

    In the result file data from `path2` will always come _after_ data
    from `path1`.

    Caution: It is the _callers_ responsibility to remove the
    result file (which is created by tempfile.mkstemp) after usage.
    If merging fails, the temporary file is removed here before the
    error is passed on.

    This CSV file merging copes with different column orders in both
    CSV files and even with different column sets in both files. The
    columns of the result are the (sorted) column names found in the
    first data row of each file.

    Also broken/empty CSV files can be handled.
    """
    import sys  # local import: only needed for the file mode switch

    # The Python 2 `csv` module works on byte streams while Python 3
    # wants text files opened with ``newline=''``.
    binary_io = sys.version_info[0] < 3

    def open_source(path):
        # Open a CSV file for reading in the mode `csv` requires.
        if binary_io:
            return open(path, 'rb')
        return open(path, 'r', newline='')

    def open_result(descriptor):
        # Wrap the temp file descriptor in a file object for writing.
        if binary_io:
            return os.fdopen(descriptor, 'wb')
        return os.fdopen(descriptor, 'w', newline='')

    def sniff_columns(path):
        # Get the column names from the first data row of a CSV file.
        with open_source(path) as source:
            try:
                return list(next(csv.DictReader(source)).keys())
            except StopIteration:
                # No data rows at all
                return []

    fieldnames = sorted(set(sniff_columns(path1) + sniff_columns(path2)))
    # now read/write the real data
    wp, tmp_path = tempfile.mkstemp()
    try:
        with open_result(wp) as result:
            writer = csv.DictWriter(result, fieldnames)
            writer.writerow(dict((x, x) for x in fieldnames))  # header
            for path in (path1, path2):
                with open_source(path) as source:
                    for row in csv.DictReader(source):
                        writer.writerow(row)
    except Exception:
        # Nobody will ever learn about the temp file, so drop it.
        os.unlink(tmp_path)
        raise
    return tmp_path

Note: See TracBrowser for help on using the repository browser.

Download in other formats: