Context navigation

source: main/waeup.kofa/trunk/src/waeup/kofa/datacenter.py @ 15250

Last change on this file since 15250 was 14680, checked in by Henrik Bettermann, 7 years ago
Do not count lines if file isn't a csv file.
Property svn:keywords set to `Id`
File size: 15.3 KB

Rev	Line
[7193]	1	## $Id: datacenter.py 14680 2017-04-24 13:21:51Z henrik $
	2	##
	3	## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
	4	## This program is free software; you can redistribute it and/or modify
	5	## it under the terms of the GNU General Public License as published by
	6	## the Free Software Foundation; either version 2 of the License, or
	7	## (at your option) any later version.
	8	##
	9	## This program is distributed in the hope that it will be useful,
	10	## but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	## GNU General Public License for more details.
	13	##
	14	## You should have received a copy of the GNU General Public License
	15	## along with this program; if not, write to the Free Software
	16	## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	17	##
[7819]	18	"""Kofa data center.
[4146]	19
	20	The waeup data center cares for management of upload data and provides
	21	tools for importing/exporting CSV data.
	22	"""
[10045]	23	import codecs
	24	import fnmatch
	25	import grok
[4146]	26	import os
[8512]	27	import re
[4896]	28	import shutil
[4146]	29	from datetime import datetime
[7568]	30	from zope.component import getUtility
[4883]	31	from zope.component.interfaces import ObjectEvent
[7811]	32	from waeup.kofa.interfaces import (IDataCenter, IDataCenterFile,
[7568]	33	IDataCenterStorageMovedEvent,
[7579]	34	IDataCenterConfig)
[9217]	35	from waeup.kofa.utils.batching import ExportJobContainer
[8634]	36	from waeup.kofa.utils.helpers import copy_filesystem_tree, merge_csv_files
[7811]	37	from waeup.kofa.utils.logger import Logger
[4146]	38
#: Regular expression describing a logfile name with backup extension,
#: i.e. any non-empty name followed by a dot and a purely numerical
#: suffix ('app.log.1', 'app.log.23', ...).
#: A raw string is used so that ``\.`` and ``\d`` reach the regex engine
#: untouched instead of being (invalid) string escapes.
RE_LOGFILE_BACKUP_NAME = re.compile(r'^.+\.\d+$')
[8512]	41
class DataCenter(grok.Container, Logger, ExportJobContainer):
    """A data center contains CSV files.

    Files are kept in a directory on disk (``self.storage``). Pending
    files live directly in that directory; the subdirectories
    ``finished``, ``unfinished``, ``logs`` and ``deleted`` are created
    on demand.
    """
    grok.implements(IDataCenter)

    logger_name = 'waeup.kofa.${sitename}.datacenter'
    logger_filename = 'datacenter.log'

    #: Maximum number of entries considered by `getFinishedFiles`.
    max_files = 100

    def __init__(self, *args, **kw):
        """Set up the container and its storage directory.

        All arguments are passed through to the base classes. The
        storage path is looked up from the registered
        `IDataCenterConfig` utility.
        """
        super(DataCenter, self).__init__(*args, **kw)
        self.storage = getUtility(IDataCenterConfig)['path']
        self._createSubDirs()

    def _createSubDirs(self):
        """Create standard subdirs (if they do not exist already).
        """
        for name in ['finished', 'unfinished', 'logs', 'deleted']:
            path = os.path.join(self.storage, name)
            if os.path.exists(path):
                continue
            os.mkdir(path)
        return

    @property
    def deleted_path(self):
        """Get the path for deleted object data.
        """
        return os.path.join(self.storage, 'deleted')

    def getPendingFiles(self, sort='name'):
        """Get a list of files stored in `storage`.

        Only regular files ending in ``.csv`` or ``.gz`` are
        considered. Files are sorted by basename, unless `sort` is
        ``'date'``; in that case they are sorted by timestamp, newest
        first.

        Returns a list of `DataCenterFile` objects, which is empty if
        the storage directory does not exist.
        """
        result = []
        if not os.path.exists(self.storage):
            return result
        for filename in sorted(os.listdir(self.storage)):
            fullpath = os.path.join(self.storage, filename)
            if not os.path.isfile(fullpath):
                continue
            if not filename.endswith(('.csv', '.gz')):
                continue
            result.append(DataCenterFile(fullpath))
        if sort == 'date':
            # sort results in newest-first order...
            result = sorted(result, key=lambda x: x.getTimeStamp(),
                            reverse=True)
        return result

    def getFinishedFiles(self):
        """Get a list of files stored in `finished` subfolder of `storage`.

        Only files named ``*.finished.csv`` are considered. They are
        sorted by modification time, newest first, and at most
        `max_files` of them are looked at.

        Returns a list of `DataCenterFile` objects, which is empty if
        the `finished` subfolder does not exist.
        """
        result = []
        finished_dir = os.path.join(self.storage, 'finished')
        if not os.path.exists(finished_dir):
            return result
        mtime = lambda f: os.stat(os.path.join(finished_dir, f)).st_mtime
        # Filter before sorting, so that only candidates are stat()-ed.
        candidates = [f for f in os.listdir(finished_dir)
                      if fnmatch.fnmatch(f, '*.finished.csv')]
        finished_files = sorted(candidates, key=mtime, reverse=True)
        for filename in finished_files[:self.max_files]:
            fullpath = os.path.join(finished_dir, filename)
            if not os.path.isfile(fullpath):
                continue
            result.append(DataCenterFile(fullpath, 'finished'))
        return result

    def getLogFiles(self, exclude_backups=True):
        """Get the files from logs/ subdir. Files are sorted by name.

        By default backup logs ('app.log.1', etc.) and payments.log
        are excluded. With `exclude_backups` set to a false value,
        backup logs are included; payments.log is skipped in any case.

        The logs/ subdir is created if it does not exist yet.

        Returns a list of `LogFile` objects.
        """
        result = []
        logdir = os.path.join(self.storage, 'logs')
        if not os.path.exists(logdir):
            os.mkdir(logdir)
        for name in sorted(os.listdir(logdir)):
            path = os.path.join(logdir, name)
            if not os.path.isfile(path):
                continue
            if name == 'payments.log':
                continue
            if exclude_backups and RE_LOGFILE_BACKUP_NAME.match(name):
                continue
            result.append(LogFile(path))
        return result

    def setStoragePath(self, path, move=False, overwrite=False):
        """Set the path where to store files.

        `path` must exist already, otherwise a `ValueError` is
        raised. If `move` is ``True``, the contents of the old storage
        are copied over via `copy_filesystem_tree` (passing on
        `overwrite`) before the storage is switched.

        Afterwards the standard subdirs are created and a
        `DataCenterStorageMovedEvent` is sent.

        Returns whatever `copy_filesystem_tree` returned (an empty
        list if nothing was moved).
        """
        path = os.path.abspath(path)
        not_copied = []
        if not os.path.exists(path):
            raise ValueError('The path given does not exist: %s' % path)
        if move is True:
            not_copied = copy_filesystem_tree(self.storage, path,
                                              overwrite=overwrite)
        self.storage = path
        self._createSubDirs()
        grok.notify(DataCenterStorageMovedEvent(self))
        return not_copied

    def _moveFile(self, source, dest):
        """Move file source to dest preserving ctime, mtime, etc.

        A missing `source` is logged as a warning and otherwise
        ignored. Nothing happens if `source` and `dest` are equal.
        """
        if not os.path.exists(source):
            self.logger.warn('No such source path: %s' % source)
            return
        if source == dest:
            return
        shutil.copyfile(source, dest)
        shutil.copystat(source, dest)
        os.unlink(source)

    def _appendCSVFile(self, source, dest):
        """Append data from CSV file `source` to data from CSV file `dest`.

        If `dest` does not exist, `source` is simply moved there.
        Otherwise both files are merged with `merge_csv_files` and the
        result replaces `dest`.

        The `source` file is deleted afterwards.
        """
        if not os.path.exists(dest):
            return self._moveFile(source, dest)
        if not os.path.exists(source):
            self.logger.warn('No such source path: %s' % source)
            return
        if source == dest:
            return
        result_path = merge_csv_files(dest, source)
        # rw-rw-r--; spelled `0o664` as this form is valid in
        # Python 2.6+ and Python 3.
        os.chmod(result_path, 0o664)
        self._moveFile(result_path, dest)
        os.unlink(source)

    def _isInsideStorage(self, path):
        """Tell whether `path` is the storage dir or located below it.

        Both paths are normalized and compared by path components, so
        that neither a trailing slash nor a sibling directory sharing
        a name prefix with the storage dir leads to a wrong answer.
        """
        storage = os.path.abspath(self.storage)
        path = os.path.abspath(path)
        if path == storage:
            return True
        return path.startswith(storage.rstrip(os.sep) + os.sep)

    def distProcessedFiles(self, successful, source_path, finished_file,
                           pending_file, mode='create', move_orig=True):
        """Put processed files into final locations.

        ``successful`` is a boolean that tells, whether processing was
        successful.

        ``source_path``: path to file that was processed.

        ``finished_file``, ``pending_file``: paths to the respective
        generated .pending and .finished file. The .pending file path
        may be ``None``.

        ``mode`` becomes part of the names of the generated files.

        ``move_orig`` is accepted but not evaluated by this method.

        If finished file is placed in a location outside the local
        storage dir, the complete directory is removed
        afterwards. Regular processors should put their stuff in
        dedicated temporary dirs.

        See datacenter.txt for more info about how this works.
        """
        basename = os.path.basename(source_path)
        pending_name = basename
        pending = False
        finished_dir = os.path.join(self.storage, 'finished')
        unfinished_dir = os.path.join(self.storage, 'unfinished')

        if basename.endswith('.pending.csv'):
            # A pending file was re-processed. Look for the original
            # source file it stems from in the `unfinished` dir.
            maybe_basename = "%s.csv" % basename.rsplit('.', 3)[0]
            maybe_src = os.path.join(unfinished_dir, maybe_basename)
            if os.path.isfile(maybe_src):
                basename = maybe_basename
                pending = True

        base, ext = os.path.splitext(basename)
        finished_name = "%s.%s.finished%s" % (base, mode, ext)
        if not pending:
            pending_name = "%s.%s.pending%s" % (base, mode, ext)

        # Put .pending and .finished file into respective places...
        pending_dest = os.path.join(self.storage, pending_name)
        finished_dest = os.path.join(finished_dir, finished_name)
        self._appendCSVFile(finished_file, finished_dest)
        if pending_file is not None:
            self._moveFile(pending_file, pending_dest)

        # Put source file into final location...
        finished_dest = os.path.join(finished_dir, basename)
        unfinished_dest = os.path.join(unfinished_dir, basename)
        if successful and not pending:
            self._moveFile(source_path, finished_dest)
        elif successful and pending:
            self._moveFile(unfinished_dest, finished_dest)
            os.unlink(source_path)
        elif not successful and not pending:
            self._moveFile(source_path, unfinished_dest)

        # If finished and pending-file were created in a location
        # outside datacenter storage, we remove it. An empty dirname
        # (bare filename given) is never removed.
        maybe_temp_dir = os.path.dirname(finished_file)
        if maybe_temp_dir and not self._isInsideStorage(maybe_temp_dir):
            shutil.rmtree(maybe_temp_dir)
        return

    def _logfiles(self, basename):
        """Get sorted logfiles starting with `basename`.

        The result contains `basename` itself plus all names in the
        logs/ subdir that consist of `basename` and a numerical
        extension. Names are sorted by that extension in descending
        order with `basename` itself (counted as 0) coming last.
        """
        def numerical_suffix(name):
            # return numerical suffix in `name` as number or 0.
            suffix = name.rsplit('.', 1)[-1]
            try:
                return int(suffix)
            except ValueError:
                return 0
        files = [basename]
        for name in os.listdir(os.path.join(self.storage, 'logs')):
            if RE_LOGFILE_BACKUP_NAME.match(name):
                if name.rsplit('.', 1)[0] == basename:
                    files.append(name)
        return sorted(files, key=numerical_suffix, reverse=True)

    def queryLogfiles(self, basename, query=None, limit=0, start=0):
        """Search `query` in all logfiles starting with `basename`.

        Returns an iterator of those lines in logfiles starting with
        `basename` that match `query`. If you want the result as a
        list, simply list() the iterator.

        All logfiles with name `basename` and maybe some numerical
        extension ('.1', '.2', ...) are searched for the `query` term
        in correct chronological order. So, if you ask for a basename
        'app.log', then any file named 'app.log.2', 'app.log.1',
        'app.log', etc. will be searched in that order.

        Only the last path component of `basename` is used, so that
        no files outside the logs/ subdir can be read.

        The `query` is expected to be a string containing a regular
        expression.

        If `limit` is set to some numerical value, at most this number
        of lines is returned.

        With `start` you can give the number of first matched line to
        return. `start` is zero-based, i.e. the first match has number
        0, the second one 1, etc.

        Together with `limit` this allows some basic
        batching. Please keep in mind that batching might give
        unpredictable results, when logfiles change between two
        requests. This is not a problem when only one file is searched
        and changes include only appending new log messages.

        Matches are found per line only (no multiline matches).

        Result lines are returned as unicode instances decoded from
        UTF-8 encoding. This means that logfiles must provide UTF-8
        encoding for umlauts etc. if these should be rendered
        properly. The returned unicode lines can be fed to page
        templates even if they contain non-ASCII characters.

        This method raises ValueError if some basic condition is not
        met, for instance if the given query string is not a valid
        regular expression.

        Please note, that this exception will happen not before you
        really fetch a result line.
        """
        try:
            re_query = re.compile(query)
        except Exception:
            raise ValueError('Invalid query string: %s' % query)

        # Logfiles live flat in the logs/ subdir. Dropping any
        # directory part prevents path traversal ('../../etc/passwd').
        basename = os.path.basename(basename)
        files = self._logfiles(basename)

        # Search the log files
        num = 0
        for name in files:
            path = os.path.join(self.storage, 'logs', name)
            if not os.path.isfile(path):
                continue
            # The `with` block closes the file, also when the
            # generator is left early or discarded.
            with codecs.open(path, 'rb', 'utf-8') as logfile:
                for line in logfile:
                    if not re_query.search(line):
                        continue
                    num += 1
                    if (num - 1) < start:
                        continue
                    yield line

                    if limit and (num - limit >= start):
                        # A plain `return` ends a generator; raising
                        # StopIteration here is an error under PEP 479.
                        return
	332
class DataCenterFile(object):
    """A description of a file stored in data center.

    `context` is the filesystem path of the described file. Size,
    modification date and number of lines are determined once, on
    creation, and stored as attributes.
    """
    grok.implements(IDataCenterFile)

    def __init__(self, context, folder_name=''):
        self.context = context
        self.name = os.path.basename(self.context)
        # path relative to storage: `folder_name` joined with file name
        self.rel_path = os.path.join(folder_name, self.name)
        self.size = self.getSize()
        self.uploaddate = self.getDate()
        self.lines = self.getLinesNumber()

    def getDate(self):
        """Get a human readable datetime representation.

        The value is the modification time of the file formatted as
        ``YYYY-MM-DD HH:MM:SS``.
        """
        date = datetime.fromtimestamp(os.path.getmtime(self.context))
        return date.strftime("%Y-%m-%d %H:%M:%S")

    def getTimeStamp(self):
        """Get a (machine readable) timestamp.

        This is the modification time as returned by
        `os.path.getmtime`.
        """
        return os.path.getmtime(self.context)

    def getSize(self):
        """Get a human readable file size.

        Sizes below 1024 bytes are given in bytes, larger ones with
        two decimal places in the largest fitting unit of kb, MB or
        GB (based on powers of 1024).
        """
        bytesize = os.path.getsize(self.context)
        size = "%s bytes" % bytesize
        units = ['kb', 'MB', 'GB']
        # Try the largest unit first.
        for power, unit in reversed(list(enumerate(units))):
            power += 1
            if bytesize >= 1024 ** power:
                size = "%.2f %s" % (bytesize/(1024.0**power), unit)
                break
        return size

    def getLinesNumber(self):
        """Get number of lines.

        Lines are counted only for files whose name ends with 'csv'
        (case-insensitive). For any other file ``1`` is returned.
        """
        if not self.context.lower().endswith('csv'):
            return 1
        # The `with` block makes sure the file handle is closed again.
        with open(self.context, 'rb') as fd:
            return sum(1 for line in fd)
[6286]	379
class LogFile(DataCenterFile):
    """A description of a log file.

    In addition to the attributes of a `DataCenterFile`, ``name:
    value`` markers found in the file are provided as `userid`,
    `mode`, `stats` and `source`.
    """
    def __init__(self, context):
        super(LogFile, self).__init__(context)
        self._markers = dict()
        self._parsed = False
        self.userid = self.getUserId()
        self.mode = self.getMode()
        self.stats = self.getStats()
        self.source = self.getSourcePath()

    def _parseFile(self, maxline=10):
        """Find markers in a file.

        Each line containing a colon is split at the first colon. The
        lowercased part before it becomes the marker name, the rest is
        the marker text. If a name appears several times, the last
        occurrence wins. The file is read as UTF-8 and parsed at most
        once.

        NOTE(review): `maxline` is accepted but not evaluated; the
        whole file is read.
        """
        if self._parsed:
            return
        # The `with` block makes sure the file handle is closed again.
        with codecs.open(self.context, 'rb', 'utf-8') as logfile:
            for line in logfile:
                line = line.strip()
                if ':' not in line:
                    continue
                name, text = line.split(':', 1)
                self._markers[name.lower()] = text
        self._parsed = True
        return

    def _getMarker(self, marker):
        """Get the text stored for `marker` or ``None``.

        The lookup is case-insensitive. The file is parsed on first
        use.
        """
        if not self._parsed:
            self._parseFile()
        return self._markers.get(marker.lower())

    def getUserId(self):
        """Get text of the 'user' marker or ``'<UNKNOWN>'``."""
        return self._getMarker('user') or '<UNKNOWN>'

    def getMode(self):
        """Get text of the 'mode' marker or ``'<NOT SET>'``."""
        return self._getMarker('mode') or '<NOT SET>'

    def getStats(self):
        """Get text of the 'processed' marker or ``'<Info not avail.>'``."""
        return self._getMarker('processed') or '<Info not avail.>'

    def getSourcePath(self):
        """Get text of the 'source' marker or ``None``."""
        return self._getMarker('source') or None
[4883]	424
[4961]	425
class DataCenterStorageMovedEvent(ObjectEvent):
    """Event signalling that the storage of a datacenter was moved.

    It is sent with the respective datacenter as event object after
    the storage path of that datacenter was set.
    """
    grok.implements(IDataCenterStorageMovedEvent)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: