Context navigation

source: main/waeup.kofa/trunk/src/waeup/kofa/utils/helpers.py @ 8633

Last change on this file since 8633 was 8633, checked in by uli, 12 years ago
Provide the more robust CSV file merging.
Property svn:keywords set to `Id`
File size: 21.5 KB

Rev	Line
[7196]	1	## $Id: helpers.py 8633 2012-06-06 01:11:19Z uli $
	2	##
	3	## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
	4	## This program is free software; you can redistribute it and/or modify
	5	## it under the terms of the GNU General Public License as published by
	6	## the Free Software Foundation; either version 2 of the License, or
	7	## (at your option) any later version.
	8	##
	9	## This program is distributed in the hope that it will be useful,
	10	## but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	## GNU General Public License for more details.
	13	##
	14	## You should have received a copy of the GNU General Public License
	15	## along with this program; if not, write to the Free Software
	16	## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	17	##
[7819]	18	"""General helper functions for Kofa.
[4188]	19	"""
[8631]	20	import csv
[8185]	21	import datetime
[8466]	22	import imghdr
[4188]	23	import os
[8185]	24	import pytz
[4375]	25	import re
[4188]	26	import shutil
[8631]	27	import tempfile
[5731]	28	import grok
[5848]	29	from cStringIO import StringIO
	30	from docutils.core import publish_string
[7943]	31	from zope.component import getUtility
[5731]	32	from zope.component.interfaces import IFactory
[5734]	33	from zope.interface import implementedBy
[7941]	34	from zope.interface.interface import Method, Attribute
[6071]	35	from zope.schema import getFieldNames
	36	from zope.schema.fieldproperty import FieldProperty
[6372]	37	from zope.security.interfaces import NoInteraction
	38	from zope.security.management import getInteraction
[7175]	39	from zope.pluggableauth.interfaces import IAuthenticatorPlugin
[4188]	40
[6503]	41	BUFSIZE = 8 * 1024
[6372]	42
def remove_file_or_directory(filepath):
    """Remove a file or directory.

    Different to :func:`shutil.rmtree` we also accept not existing
    paths (returning silently) and if a dir turns out to be a regular
    file, we remove that.

    Symbolic links are removed themselves; the files or directories
    they point to are left untouched. This includes dangling links.
    """
    filepath = os.path.abspath(filepath)
    # lexists() does not follow symlinks, so dangling links are seen too.
    if not os.path.lexists(filepath):
        return
    # shutil.rmtree() refuses to operate on symlinks, therefore links
    # to directories must be unlinked like regular files.
    if os.path.isdir(filepath) and not os.path.islink(filepath):
        shutil.rmtree(filepath)
    else:
        os.unlink(filepath)
    return
	58
def copy_filesystem_tree(src, dst, overwrite=False, del_old=False):
    """Copy contents of directory src to directory dst.

    Both directories must exists.

    If `overwrite` is true, any same named objects will be
    overwritten. Otherwise these files will not be touched.

    If `del_old` is true, copied files and directories will be removed
    from the src directory.

    This functions returns a list of non-copied files.

    Unix hidden files and directories (starting with '.') are not
    processed by this function.

    Raises :exc:`ValueError` if `src` or `dst` does not exist or is
    not a directory.
    """
    if not os.path.exists(src):
        raise ValueError('source path does not exist: %s' % src)
    if not os.path.exists(dst):
        raise ValueError('destination path does not exist: %s' % dst)
    if not os.path.isdir(src):
        raise ValueError('source path is not a directory: %s' % src)
    if not os.path.isdir(dst):
        raise ValueError('destination path is not a directory: %s' % dst)
    not_copied = []
    for item in os.listdir(src):
        if item.startswith('.'):
            continue  # We do not copy hidden stuff...
        itemsrc = os.path.join(src, item)
        itemdst = os.path.join(dst, item)

        if os.path.exists(itemdst):
            # Any true value enables overwriting (not only ``True``).
            if not overwrite:
                not_copied.append(item)
                continue
            remove_file_or_directory(itemdst)

        if os.path.isdir(itemsrc):
            shutil.copytree(itemsrc, itemdst)
        else:
            shutil.copy2(itemsrc, itemdst)
        if del_old:
            remove_file_or_directory(itemsrc)
    return not_copied
[4375]	104
	105
def get_inner_HTML_part(html_code):
    """Return the 'inner' part of a complete HTML snippet.

    If there is a form part, get this.

    If there is no form part, try to return the body part contents.

    If there is no body, return as-is.

    Let's see how that works. If we deliver some doc with form, we
    will get that form only:

       >>> doc = '<html><form>My Form</form>Outside the form</html>'
       >>> get_inner_HTML_part(doc)
       '<form>My Form</form>'

    No form? Then seek for a body part and get the contents:

       >>> doc = '<html><body>My Body</body>Trailing Trash</html>'
       >>> get_inner_HTML_part(doc)
       'My Body'

    If none of these is included, return what we got:

       >>> doc = '<html>without body nor form</html>'
       >>> get_inner_HTML_part(doc)
       '<html>without body nor form</html>'

    """
    # The form (including its tags) wins over everything else. Some
    # text is required before and after the form for a match.
    form_match = re.match(
        r'^.+(<form[^>]*>.*</form>).+$', html_code, re.DOTALL)
    if form_match is not None:
        return form_match.group(1)
    # No <form> part included: fall back to the contents of <body>.
    body_match = re.match(
        r'^.+<body[^>]*>(.*)</body>.*$', html_code, re.DOTALL)
    if body_match is not None:
        return body_match.group(1)
    # No <form> and no <body> tag...
    return html_code
	150
class FactoryBase(grok.GlobalUtility):
    """A factory for things.

    This is a baseclass for easier creation of factories. Factories
    are utilities that are registered under a certain name and return
    instances of certain classes when called.

    In :mod:`waeup.kofa` we use factories extensively for
    batching. While processing a batch some processors looks up a
    factory to create real-world instances that then get filled with
    data from imported CSV files.

    To get rid of reimplementing the same stuff over and over again,
    most notably the methods defined here, we offer this base class
    (which will not be registered as a factory itself).

    Real factories can then be created like this:

       >>> import grok
       >>> from waeup.kofa.utils.helpers import FactoryBase
       >>> class MyObject(object):
       ...     # Some class we want to get instances of.
       ...     pass
       >>> class MyObjectFactory(FactoryBase):
       ...     # This is the factory for MyObject instances
       ...     grok.name(u'waeup.kofa.factory.MyObject')
       ...     factory = MyObject

    That's it. It is essential to set the ``factory`` attribute, which
    will determine the class of which instances should be created when
    called. The given name must even be unique amongst all utilities
    registered during runtime. While you can pick any name you like
    you might want to prepend ``waeup.kofa.factory.`` to the name
    string to make sure it does not clash with names of other
    utilities one day.

    Before all this works we have to grok the baseclass once and our
    freshly defined factory. This executes all the component
    registration stuff we don't want to do ourselves. In daily use
    this is done automatically on startup of a :mod:`waeup.kofa`
    system.

       >>> grok.testing.grok('waeup.kofa.utils.helpers')
       >>> grok.testing.grok_component(
       ...     'MyObjectFactory', MyObjectFactory
       ...     )
       True

    After grokking we (and processors) can create objects without
    knowing about the location of the real class definition, just by
    the factory name:

       >>> from zope.component import createObject
       >>> obj = createObject('waeup.kofa.factory.MyObject')
       >>> isinstance(obj, MyObject)
       True

    We can also use the regular utility lookups to find our new
    factory:

       >>> from zope.component import getUtility
       >>> from zope.component.interfaces import IFactory
       >>> factory = getUtility(
       ...     IFactory, name='waeup.kofa.factory.MyObject'
       ...     )
       >>> isinstance(factory, MyObjectFactory)
       True

    And this factory generates `MyObject` instances:

       >>> obj = factory()
       >>> isinstance(obj, MyObject)
       True

    """
    grok.baseclass()  # Do not grok this class, do not register us.
    grok.implements(IFactory)
    # You can override any of the following attributes in derived
    # classes. The `grok.name` setting must even be set to some
    # unique value.
    grok.name(u'waeup.Factory')
    # Plain strings: a trailing comma here would turn `title` into a tuple.
    title = u"Create instances of ``factory``."
    description = u"This factory instantiates new applicant instances."
    factory = None

    def __call__(self, *args, **kw):
        """The main factory function.

        Returns an instance of the requested object.

        Positional and keyword arguments are accepted, so the factory
        can be called with or without arguments, but they are not
        passed on: ``factory`` is always called without arguments.
        """
        return self.factory()

    def getInterfaces(self):
        """Return the interfaces implemented by ``factory`` instances.
        """
        # Required by IFactory
        return implementedBy(self.factory)
[5848]	246
	247	def ReST2HTML_w_warnings(source_string):
	248	"""Convert a reStructuredText string to HTML preserving warnings.
	249
	250	Returns a tuple ``(<HTML_CODE>, <WARNINGS>)``, both being
	251	strings. Where ``<HTML_CODE>`` is the HTML code generated from the
[5876]	252	source string (in unicode), ``<WARNINGS>`` is a string containing
	253	any warning messages or ``None``.
[6113]	254
[5848]	255	Regular multi-line ReStructuredText strings will be returned as
	256	HTML code:
	257
[7811]	258	>>> from waeup.kofa.utils.helpers import ReST2HTML
[5848]	259	>>> source = '''
	260	... Headline
	261	... ========
	262	...
	263	... - A list item
	264	... - Another item
	265	...
	266	... Thanks for watching!
	267	... '''
	268	>>> html, warnings = ReST2HTML_w_warnings(source)
	269	>>> print html
	270	<div class="document" id="headline">
	271	<h1 class="title">Headline</h1>
	272	<BLANKLINE>
	273	<ul class="simple">
	274	<li>A list item</li>
	275	<li>Another item</li>
	276	</ul>
	277	<p>Thanks for watching!</p>
	278	</div>
	279
	280	Here no warnings happened, so the `warnings` are ``None``:
	281
	282	>>> warnings is None
	283	True
[6113]	284
[5848]	285	If warnings happen then they can be retrieved in the returned
	286	``warnings``. We try to render an erraneous document:
	287
	288	>>> source = '''
	289	... Headline
	290	... ======
	291	...
	292	... Thanks for watching!
	293	... '''
	294	>>> html, warnings = ReST2HTML_w_warnings(source)
	295	>>> print html
	296	<div class="document" id="headline">
	297	<h1 class="title">Headline</h1>
	298	<BLANKLINE>
	299	<p>Thanks for watching!</p>
	300	</div>
	301
	302	>>> print warnings
	303	<string>:3: (WARNING/2) Title underline too short.
	304	<BLANKLINE>
	305	Headline
	306	======
	307	<BLANKLINE>
	308
	309	As you can see, the warnings are not displayed inline the document
	310	but can be retrieved from the returned warnings, which is a string
	311	or ``None``.
	312	"""
	313	warnings = StringIO()
	314	fulldoc = publish_string(
	315	source_string, writer_name='html4css1',
	316	settings_overrides={
	317	'report_level': 0,
	318	'warning_stream': warnings,
	319	})
	320	warnings.seek(0)
	321	warning_msgs = warnings.read()
	322	if warning_msgs:
	323	# Render again, this time with no warnings inline...
	324	fulldoc = publish_string(
	325	source_string, writer_name='html4css1',
	326	settings_overrides={
	327	'report_level': 10000,
	328	'halt_level': 10000,
	329	'warning_stream': warnings,
	330	})
	331	if warning_msgs == '':
	332	warning_msgs = None
[7186]	333	result = get_inner_HTML_part(fulldoc).strip()
[5876]	334	if not isinstance(result, unicode):
	335	result = result.decode('utf-8')
	336	return result, warning_msgs
[5848]	337
	338	def ReST2HTML(source_string):
	339	"""Render a string containing ReStructuredText to HTML.
	340
	341	Any warnings about too short headings, etc. are silently
	342	discarded. Use :func:`ReST2HTML_w_warnings` if you want to get any
	343	warnings.
	344
[5876]	345	The returned string will be unicode.
[6113]	346
[5848]	347	A regular document will be rendered like this:
	348
	349	>>> source = '''
	350	... Headline
	351	... ========
	352	...
	353	... Thanks for watching!
	354	... '''
	355	>>> html = ReST2HTML(source)
	356	>>> print html
	357	<div class="document" id="headline">
	358	<h1 class="title">Headline</h1>
	359	<BLANKLINE>
	360	<p>Thanks for watching!</p>
	361	</div>
	362
	363	A document with markup problems (here: the underline is too short)
	364	will look similar:
	365
	366	>>> source = '''
	367	... Headline
	368	... ======
	369	...
	370	... Thanks for watching!
	371	... '''
	372	>>> html = ReST2HTML(source)
	373	>>> print html
	374	<div class="document" id="headline">
	375	<h1 class="title">Headline</h1>
	376	<BLANKLINE>
	377	<p>Thanks for watching!</p>
	378	</div>
[6113]	379
[5848]	380	"""
	381	html, warnings = ReST2HTML_w_warnings(source_string)
	382	return html
[6071]	383
	384	def attrs_to_fields(cls):
	385	"""Turn the attributes of a class into FieldProperty instances.
[6113]	386
	387	With Python >= 2.6 we can even use this function as a class decorator.
[6071]	388	"""
	389	iface = list(implementedBy(cls))[0]
	390	for field_name in getFieldNames(iface):
	391	setattr(cls, field_name, FieldProperty(iface[field_name]))
	392	return cls
[6372]	393
	394	def get_current_principal():
	395	"""Get the 'current' principal.
	396
	397	This method works without a request. Examining a request is the
	398	regular (and recommended) way to get a principal involved
	399	'currently'.
	400
	401	Use this method only if you really have no access to the current
	402	request.
	403
	404	Returns ``None`` when no principal is involved (for instance
	405	during tests).
	406	"""
	407	try:
	408	principal = getInteraction().participations[0].principal
	409	except NoInteraction:
	410	return None
	411	except IndexError: # No participations present
	412	return None
	413	return principal
[6503]	414
	415	def cmp_files(file_descr1, file_descr2):
	416	"""Compare two files by their file descriptors.
	417
	418	Returns ``True`` if both are equal, ``False`` otherwise.
	419	"""
[6531]	420	file_descr1.seek(0)
	421	file_descr2.seek(0)
[6503]	422	while True:
	423	b1 = file_descr1.read(BUFSIZE)
	424	b2 = file_descr2.read(BUFSIZE)
	425	if b1 != b2:
	426	return False
	427	if not b1:
	428	return True
[7078]	429
	430	def string_from_bytes(number):
	431	"""Turn a number into some textual representation.
	432
	433	Examples:
	434
	435	>>> string_from_bytes(1)
	436	u'1 byte(s)'
	437
	438	>>> string_from_bytes(1025)
	439	u'1 KB'
	440
	441	>>> string_from_bytes(1.5 * 1024*1024)
	442	u'1.50 MB'
	443
	444	>>> string_from_bytes(673.286 * 1024**3)
	445	u'673.29 GB'
	446
	447	"""
	448	if number < 1024:
	449	return u'%s byte(s)' % (str(number),)
	450	elif number < 1024**2:
	451	return u'%s KB' % (number / 1024,)
	452	elif number < 1024**3:
	453	return u'%.2f MB' % (number / 1024**2,)
	454	return u'%.2f GB' % (number / 1024**3,)
[7079]	455
	456	def file_size(file_like_obj):
	457	"""Determine file size in most effective manner.
	458
	459	Returns the number of bytes in a file. This function works for
	460	both, real files as well as file-like objects like cStringIO based
	461	'files'.
	462
	463	Example:
	464
	465	>>> from cStringIO import StringIO
	466	>>> file_size(StringIO('my file content'))
	467	15
	468
	469	Please note that this function expects the file-like object passed
	470	in to be at first reading position (it does no seek(0)) and that
	471	when finished the file pointer might be at end of file.
	472	"""
	473	if hasattr(file_like_obj, 'fileno'):
	474	return os.fstat(file_like_obj.fileno())[6]
	475	file_like_obj.seek(0, 2) # seek to last position in file
	476	return file_like_obj.tell()
[7175]	477
	478	def get_user_account(request):
	479	"""Return local user account.
	480	"""
	481	principal_id = request.principal.id
[7234]	482	authenticator = getUtility(IAuthenticatorPlugin, name='users')
	483	account = authenticator.getAccount(principal_id)
[7175]	484	return account
[7941]	485
	486	def iface_names(iface, omit=[], exclude_attribs=True, exclude_methods=True):
	487	"""Get all attribute names of an interface.
	488
	489	Searches also base interfaces.
	490
	491	Names of fields that are pure attributes
	492	(i.e. zope.interface.Attribute) or methods are excluded by
	493	default.
	494
	495	Names of typical fields derived from zope.schema are included.
	496
	497	The `omit` paramter can give a list of names to exclude.
	498
	499	Returns an unsorted list of strings.
	500	"""
[8370]	501	ifaces = set((iface,))
	502	# Collect all interfaces (also bases) recursively
	503	while True:
	504	ext_ifaces = set(ifaces)
	505	for iface in ext_ifaces:
	506	ext_ifaces = set.union(ext_ifaces, set(iface.getBases()))
	507	if ext_ifaces == ifaces:
	508	# No new interfaces found, list complete
	509	break
	510	ifaces = ext_ifaces
	511	# Collect (filtered) names of collected interfaces
[7941]	512	result = []
[8370]	513	for iface in ifaces:
	514	for name, descr in iface.namesAndDescriptions():
	515	if name in omit:
	516	continue
	517	if exclude_attribs and descr.__class__ is Attribute:
	518	continue
	519	if exclude_methods and isinstance(descr, Method):
	520	continue
	521	result.append(name)
[7941]	522	return result
[7968]	523
	524	def get_sorted_preferred(tuples_iterable, preferred_list):
	525	"""Get a list of tuples (<TITLE>,<TOKEN>) with values in
	526	`preferred_list` put in front.
	527
	528	The rest of the tuples iterable is returned in orginal order. This
	529	is useful for putting default entries on top of (already sorted)
	530	lists of choice values, for instance when sorting countries and
	531	their code.
	532
	533	Sample:
	534
	535	We have a list of tuples with uppercase 'titles' and lowercase
	536	'tokens'. This list is already sorted but we want certain values
	537	of this list to show up before other values. For instance we want
	538	to see the 'C' entry to come first.
	539
	540	>>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
	541	... ['c'])
	542	(('C', 'c'), ('A', 'a'), ('B', 'b'))
	543
	544	i.e. the entry with 'c' as second value moved to head of result.
	545
	546	We can also require multiple entries at head of list:
	547
	548	>>> get_sorted_preferred([('A','a'), ('B','b'), ('C','c')],
	549	... ['b', 'c'])
	550	(('B', 'b'), ('C', 'c'), ('A', 'a'))
	551
	552	We required the 'b' entry to come before the 'c' entry and then
	553	the rest of the input list. That's what we got.
	554
	555	The result is returned as a tuple of tuples to keep order of values.
	556	"""
	557	result = [None for x in preferred_list]
	558	for title, code in tuples_iterable:
	559	if code in preferred_list:
	560	index = preferred_list.index(code)
	561	result[index] = (title, code)
	562	else:
	563	result.append((title, code))
	564	return tuple(result)
[8185]	565
	566	def now(tz=None):
	567	"""Get current datetime in timezone of `tz`.
	568
	569	If `tz`, a `tzinfo` instance, is None, UTC time is returned.
	570
	571	`tz` should be a timezone as defined in pytz.
	572	"""
	573	return to_timezone(datetime.datetime.utcnow(), tz=tz)
	574
	575	def to_timezone(dt, tz=None):
	576	"""Shift datetime into timezone `tz`.
	577
	578	If datetime `dt` contains no `tzinfo` (i.e. it is 'naive'), it is
	579	assumed to be UTC.
	580
	581	If no `tz` is given, shift to UTC is performed.
[8192]	582
	583	If `dt` is not a datetime.datetime, the input value is returned
	584	unchanged.
[8185]	585	"""
[8192]	586	if not isinstance(dt, datetime.datetime):
	587	return dt
[8185]	588	if tz is None:
	589	tz = pytz.utc
	590	if dt.tzinfo is None:
	591	dt = pytz.utc.localize(dt)
	592	return tz.normalize(dt.tzinfo.normalize(dt).astimezone(tz))
[8466]	593
	594	def get_fileformat(path, bytestream=None):
	595	"""Try to determine the file format of a given media file.
	596
	597	Although checks done here are not done very thoroughly, they make
	598	no assumptions about the filetype by looking at its filename
	599	extension or similar. Instead they check header data to comply
	600	with common known rules (Magic Words).
	601
	602	If bytestream is not `None` the `path` is ignored.
	603
	604	Returns filetype as string (something like ``'jpg'``) if
	605	file-format can be recognized, ``None`` else.
	606
	607	Tested recognized filetypes currently are `jpg`, `png`, and `pdf`.
	608
	609	More filetypes (though untested in waeup.kofa) are automatically
	610	recognized because we deploy the stdlib `imghdr` library. See this
	611	module's docs for a complete list of filetypes recognized.
	612	"""
	613	if path is None and bytestream is None:
	614	return None
	615
	616	img_type = None
	617	if bytestream is not None:
	618	img_type = imghdr.what(path, bytestream)
	619	else:
	620	img_type = imghdr.what(path)
	621	for name, replacement in (('jpeg', 'jpg'), ('tiff', 'tif')):
	622	if img_type == name:
	623	img_type = replacement
	624	return img_type
	625
	626	def check_pdf(bytestream, file):
	627	"""Tell whether a file or bytestream is a PDF file.
	628
	629	Works as a test/plugin for the stdlib `imghdr` library.
	630	"""
	631	if file is not None:
	632	file.seek(0)
	633	bytestream = file.read(4)
	634	file.seek(0)
	635
	636	if bytestream.startswith('%PDF'):
	637	return 'pdf'
	638	return None
	639
	640	# register check_pdf as header check function with `imghdr`
	641	if check_pdf not in imghdr.tests:
	642	imghdr.tests.append(check_pdf)
[8631]	643
	644	def merge_csv_files(path1, path2):
	645	"""Merge two CSV files into one (appending).
	646
	647	CSV data from `path2` will be merged into `path1` csv file. This
	648	is a bit like 'appending' data from path2 to data from path1.
	649
	650	The path of the resulting temporary file will be returned.
	651
	652	In the result file data from `path2` will always come _after_ data
	653	from `path1`.
	654
	655	Caution: It is the _callers_ responsibility to remove the
	656	result file (which is created by tempfile.mkstemp) after usage.
	657
	658	This CSV file merging copes with different column orders in both
	659	CSV files and even with different column sets in both files.
[8633]	660
	661	Also broken/empty CSV files can be handled.
[8631]	662	"""
	663	# sniff the col names
[8633]	664	try:
	665	row10 = csv.DictReader(open(path1, 'rb')).next()
	666	except StopIteration:
	667	row10 = dict()
	668	try:
	669	row20 = csv.DictReader(open(path2, 'rb')).next()
	670	except StopIteration:
	671	row20 = dict()
[8631]	672	fieldnames = sorted(list(set(row10.keys() + row20.keys())))
	673	# now read/write the real data
	674	reader1 = csv.DictReader(open(path1, 'rb'))
	675	reader2 = csv.DictReader(open(path2, 'rb'))
	676	wp, tmp_path = tempfile.mkstemp()
	677	writer = csv.DictWriter(os.fdopen(wp, 'wb'), fieldnames)
	678	writer.writerow(dict((x,x) for x in fieldnames)) # header
	679	for row in reader1:
	680	writer.writerow(row)
	681	for row in reader2:
	682	writer.writerow(row)
	683	return tmp_path

Note: See TracBrowser for help on using the repository browser.

Download in other formats: