source: main/waeup.kofa/trunk/src/waeup/kofa/datacenter.py @ 8757

Last change on this file since 8757 was 8725, checked in by uli, 13 years ago

Search only logfiles with correct basename, fix tests.

  • Property svn:keywords set to Id
File size: 13.6 KB
Line 
1## $Id: datacenter.py 8725 2012-06-14 16:47:57Z uli $
2##
3## Copyright (C) 2011 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""Kofa data center.
19
20The waeup data center cares for management of upload data and provides
21tools for importing/exporting CSV data.
22"""
23import os
24import re
25import shutil
26import grok
27from datetime import datetime
28from zope.component import getUtility
29from zope.component.interfaces import ObjectEvent
30from waeup.kofa.interfaces import (IDataCenter, IDataCenterFile,
31                                   IDataCenterStorageMovedEvent,
32                                   IDataCenterConfig)
33from waeup.kofa.utils.helpers import copy_filesystem_tree, merge_csv_files
34from waeup.kofa.utils.logger import Logger
35
#: Regular expression describing a logfile name with backup extension,
#: i.e. a non-empty stem followed by a dot and one or more digits
#: (for instance ``app.log.1``). A raw string is used so that ``\.``
#: and ``\d`` reach the regex engine instead of being treated as
#: (invalid) string escape sequences.
RE_LOGFILE_BACKUP_NAME = re.compile(r'^.+\.\d+$')
38
class DataCenter(grok.Container, Logger):
    """A data center contains CSV files.

    All files live in the directory named by ``self.storage``. Below
    that directory the subdirectories ``finished``, ``unfinished``,
    ``logs`` and ``deleted`` are maintained.
    """
    grok.implements(IDataCenter)

    logger_name = 'waeup.kofa.${sitename}.datacenter'
    logger_filename = 'datacenter.log'

    def __init__(self, *args, **kw):
        """Look up the storage path and create the standard subdirs.

        The storage path is the ``path`` entry of the registered
        `IDataCenterConfig` utility.
        """
        super(DataCenter, self).__init__(*args, **kw)
        self.storage = getUtility(IDataCenterConfig)['path']
        self._createSubDirs()

    def _createSubDirs(self):
        """Create standard subdirs.

        Subdirectories that already exist are left untouched.
        """
        for name in ['finished', 'unfinished', 'logs', 'deleted']:
            path = os.path.join(self.storage, name)
            if os.path.exists(path):
                continue
            os.mkdir(path)
        return

    @property
    def deleted_path(self):
        """Get the path for deleted object data.
        """
        return os.path.join(self.storage, 'deleted')

    def getFiles(self, sort='name'):
        """Get a list of files stored in `storage`.

        Only regular files located directly in the storage directory
        are considered; each one is wrapped in a `DataCenterFile`.

        Files are sorted by basename. If `sort` is ``'date'`` they are
        sorted by their timestamp instead, newest first.

        If the storage directory does not exist, an empty list is
        returned.
        """
        result = []
        if not os.path.exists(self.storage):
            return result
        for filename in sorted(os.listdir(self.storage)):
            fullpath = os.path.join(self.storage, filename)
            if not os.path.isfile(fullpath):
                continue
            result.append(DataCenterFile(fullpath))
        if sort == 'date':
            # sort results in newest-first order...
            result = sorted(result, key=lambda x: x.getTimeStamp(),
                            reverse=True)
        return result

    def getLogFiles(self, exclude_backups=True):
        """Get the files from logs/ subdir. Files are sorted by name.

        By default backup logs ('app.log.1', etc.) are excluded. Pass
        a false value for `exclude_backups` to include them.

        The logs/ subdir is created if it does not exist yet. Each
        regular file found is returned wrapped in a `LogFile`.
        """
        result = []
        logdir = os.path.join(self.storage, 'logs')
        if not os.path.exists(logdir):
            os.mkdir(logdir)
        for name in sorted(os.listdir(logdir)):
            path = os.path.join(logdir, name)
            if not os.path.isfile(path):
                continue
            if exclude_backups and RE_LOGFILE_BACKUP_NAME.match(name):
                continue
            result.append(LogFile(path))
        return result

    def setStoragePath(self, path, move=False, overwrite=False):
        """Set the path where to store files.

        `path` is turned into an absolute path and must exist,
        otherwise a `ValueError` is raised.

        If `move` is ``True``, the contents of the old storage are
        copied over with `copy_filesystem_tree`, passing `overwrite`
        on to it. The value returned by that call (presumably the
        names that were not copied) is returned; without `move` an
        empty list is returned.

        After switching, the standard subdirs are created in the new
        location and a `DataCenterStorageMovedEvent` is fired.
        """
        path = os.path.abspath(path)
        not_copied = []
        if not os.path.exists(path):
            raise ValueError('The path given does not exist: %s' % path)
        if move is True:
            not_copied = copy_filesystem_tree(self.storage, path,
                                              overwrite=overwrite)
        self.storage = path
        self._createSubDirs()
        grok.notify(DataCenterStorageMovedEvent(self))
        return not_copied

    def _moveFile(self, source, dest):
        """Move file `source` to `dest` preserving mtime, mode, etc.

        The file content is copied, the stat data is transferred with
        `shutil.copystat` and the source is removed afterwards.

        If `source` does not exist, a warning is logged and nothing
        else happens. If `source` and `dest` are the same path, this
        is a no-op.
        """
        if not os.path.exists(source):
            self.logger.warning('No such source path: %s' % source)
            return
        if source == dest:
            return
        shutil.copyfile(source, dest)
        shutil.copystat(source, dest)
        os.unlink(source)

    def _appendCSVFile(self, source, dest):
        """Append data from CSV file `source` to data from CSV file `dest`.

        The `source` file is deleted afterwards.

        If `dest` does not exist yet, `source` is simply moved
        there. If `source` does not exist, a warning is logged and
        nothing else happens. If both paths are equal, this is a
        no-op.
        """
        if not os.path.exists(dest):
            return self._moveFile(source, dest)
        if not os.path.exists(source):
            self.logger.warning('No such source path: %s' % source)
            return
        if source == dest:
            return
        # merge_csv_files() hands back the path of the merged file,
        # which then replaces the old destination file.
        result_path = merge_csv_files(dest, source)
        self._moveFile(result_path, dest)
        os.unlink(source)

    def _isInsideStorage(self, path):
        """Tell whether `path` is the storage dir or located below it.

        The comparison respects path components: a sibling directory
        that merely shares a name prefix with the storage directory
        (``<storage>-tmp``) is *not* considered to be inside.
        """
        storage = os.path.normpath(self.storage)
        path = os.path.normpath(path)
        # os.path.join(storage, '') appends exactly one separator and
        # also copes with a storage dir that is the filesystem root.
        return path == storage or path.startswith(
            os.path.join(storage, ''))

    def distProcessedFiles(self, successful, source_path, finished_file,
                           pending_file, mode='create', move_orig=True):
        """Put processed files into final locations.

        ``successful`` is a boolean that tells, whether processing was
        successful.

        ``source_path``: path to file that was processed.

        ``finished_file``, ``pending_file``: paths to the respective
        generated .pending and .finished file. The .pending file path
        may be ``None``.

        ``mode`` becomes part of the generated file names
        (``<base>.<mode>.finished<ext>``, ``<base>.<mode>.pending<ext>``).

        ``move_orig`` is accepted but not evaluated by this method.

        If finished file is placed in a location outside the local
        storage dir, the complete directory is removed
        afterwards. Regular processors should put their stuff in
        dedicated temporary dirs.

        See datacenter.txt for more info about how this works.
        """
        basename = os.path.basename(source_path)
        pending_name = basename
        pending = False
        finished_dir = os.path.join(self.storage, 'finished')
        unfinished_dir = os.path.join(self.storage, 'unfinished')

        if basename.endswith('.pending.csv'):
            # The source might be a pending file from an earlier run;
            # it is treated as such only if the original file still
            # waits in the unfinished/ dir.
            maybe_basename = "%s.csv" % basename.rsplit('.', 3)[0]
            maybe_src = os.path.join(unfinished_dir, maybe_basename)
            if os.path.isfile(maybe_src):
                basename = maybe_basename
                pending = True

        base, ext = os.path.splitext(basename)
        finished_name = "%s.%s.finished%s" % (base, mode, ext)
        if not pending:
            pending_name = "%s.%s.pending%s" % (base, mode, ext)

        # Put .pending and .finished file into respective places...
        pending_dest = os.path.join(self.storage, pending_name)
        finished_dest = os.path.join(finished_dir, finished_name)
        self._appendCSVFile(finished_file, finished_dest)
        if pending_file is not None:
            self._moveFile(pending_file, pending_dest)

        # Put source file into final location...
        finished_dest = os.path.join(finished_dir, basename)
        unfinished_dest = os.path.join(unfinished_dir, basename)
        if successful and not pending:
            self._moveFile(source_path, finished_dest)
        elif successful and pending:
            self._moveFile(unfinished_dest, finished_dest)
            os.unlink(source_path)
        elif not successful and not pending:
            self._moveFile(source_path, unfinished_dest)

        # If finished and pending-file were created in a location
        # outside datacenter storage, we remove it. A plain
        # os.path.commonprefix() check would compare character-wise
        # and wrongly treat '<storage>-something' as inside storage.
        maybe_temp_dir = os.path.dirname(finished_file)
        if not self._isInsideStorage(maybe_temp_dir):
            shutil.rmtree(maybe_temp_dir)
        return

    def _logfiles(self, basename):
        """Get sorted logfiles belonging to `basename`.

        The result contains `basename` itself plus every name in the
        logs/ subdir of form ``<basename>.<number>``. Names are sorted
        by their numerical suffix in descending order, names without
        numerical suffix (counted as 0) coming last.
        """
        def numerical_suffix(name):
            # return numerical suffix in `name` as number or 0.
            suffix = name.rsplit('.', 1)[-1]
            try:
                return int(suffix)
            except ValueError:
                return 0
        files = [basename]
        for name in os.listdir(os.path.join(self.storage, 'logs')):
            if not RE_LOGFILE_BACKUP_NAME.match(name):
                continue
            if name.rsplit('.', 1)[0] == basename:
                files.append(name)
        return sorted(files, key=numerical_suffix, reverse=True)

    def queryLogfiles(self, basename, query=None, limit=0, start=0):
        """Search `query` in all logfiles belonging to `basename`.

        Returns an iterator of those lines in the logfiles that match
        `query`. If you want the result as a list, simply list() the
        iterator.

        All logfiles with name `basename` and maybe some numerical
        extension ('.1', '.2', ...) are searched for the `query` term,
        files with the highest extension first. So, if you ask for a
        basename 'app.log', then the files 'app.log.2', 'app.log.1',
        'app.log' will be searched in that order. Files with other
        names (like 'app2.log') are not searched.

        The `query` is expected to be a string containing a regular
        expression.

        If `limit` is set to some numerical value, at most this number
        of lines is returned.

        With `start` you can give the number of first matched line to
        return. `start` is zero-based, i.e. the first match has number
        0, the second one 1, etc.

        Together with `limit` this allows some basic
        batching. Please keep in mind that batching might give
        unpredictable results, when logfiles change between two
        requests. This is not a problem when only one file is searched
        and changes include only appending new log messages.

        Matches are found per line only (no multiline matches).

        This method raises ValueError if some basic condition is not
        met, for instance if the given query string is not a valid
        regular expression.

        Please note, that this exception will happen not before you
        really fetch a result line.
        """
        try:
            re_query = re.compile(query)
        except Exception:
            # Any failure to compile (invalid syntax, wrong type like
            # the default ``None``, ...) is reported as ValueError.
            raise ValueError('Invalid query string: %s' % query)

        # NOTE(review): this only strips embedded '/../' sequences; a
        # basename like '../x' still passes. Confirm callers sanitize
        # user-provided basenames.
        basename = basename.replace('/../', '')
        files = self._logfiles(basename)

        # Search the log files
        num = 0
        for name in files:
            path = os.path.join(self.storage, 'logs', name)
            if not os.path.isfile(path):
                continue
            # The `with` block makes sure the file is closed, also if
            # the generator is left early or never exhausted.
            with open(path, 'rb') as logfile:
                for line in logfile:
                    if not re_query.search(line):
                        continue
                    num += 1
                    if (num - 1) < start:
                        continue
                    yield line

                    if limit and (num - limit >= start):
                        # A plain return ends the generator; raising
                        # StopIteration here would turn into a
                        # RuntimeError on newer Pythons (PEP 479).
                        return
293
class DataCenterFile(object):
    """A description of a file stored in data center.

    `context` is the path of the described file. On creation the
    attributes ``name`` (basename), ``size`` (human readable size),
    ``uploaddate`` (human readable datetime) and ``lines`` (number of
    lines) are computed from the file.
    """
    grok.implements(IDataCenterFile)

    def __init__(self, context):
        self.context = context
        self.name = os.path.basename(self.context)
        self.size = self.getSize()
        self.uploaddate = self.getDate()
        self.lines = self.getLinesNumber()

    def getDate(self):
        """Get a human readable datetime representation.

        The value is based on the ctime of the file and formatted
        like ``YYYY-MM-DD HH:MM:SS`` (local time).
        """
        date = datetime.fromtimestamp(os.path.getctime(self.context))
        return date.strftime("%Y-%m-%d %H:%M:%S")

    def getTimeStamp(self):
        """Get a (machine readable) timestamp.

        This is the ctime of the file as returned by
        `os.path.getctime`.
        """
        return os.path.getctime(self.context)

    def getSize(self):
        """Get a human readable file size.

        Sizes below 1024 bytes are given as ``<n> bytes``, bigger
        ones with two decimal places in the largest fitting unit out
        of ``kb``, ``MB`` and ``GB`` (powers of 1024).
        """
        bytesize = os.path.getsize(self.context)
        # Try the largest unit first, so the first hit is the best fit.
        for power, unit in ((3, 'GB'), (2, 'MB'), (1, 'kb')):
            if bytesize >= 1024 ** power:
                return "%.2f %s" % (bytesize / (1024.0 ** power), unit)
        return "%s bytes" % bytesize

    def getLinesNumber(self):
        """Get number of lines.

        The file is read in binary mode and closed again afterwards.
        """
        with open(self.context, 'rb') as fd:
            return sum(1 for line in fd)
337
class LogFile(DataCenterFile):
    """A description of a log file.

    In addition to the data provided by `DataCenterFile`, markers of
    form ``<name>:<text>`` are read from the file and exposed as the
    attributes ``userid``, ``mode``, ``stats`` and ``source``.
    """
    def __init__(self, context):
        super(LogFile, self).__init__(context)
        self._markers = dict()
        self._parsed = False
        self.userid = self.getUserId()
        self.mode = self.getMode()
        self.stats = self.getStats()
        self.source = self.getSourcePath()

    def _parseFile(self, maxline=10):
        """Find markers in a file.

        Every line containing a colon is split at the first colon;
        the lowercased part before it becomes the marker name, the
        rest its text. If a name occurs several times, the last
        occurrence wins.

        The file is parsed only once; subsequent calls return
        immediately.

        NOTE(review): `maxline` is currently not evaluated, the
        complete file is scanned.
        """
        if self._parsed:
            return
        with open(self.context, 'rb') as fd:
            for line in fd:
                line = line.strip()
                if ':' not in line:
                    continue
                name, text = line.split(':', 1)
                self._markers[name.lower()] = text
        self._parsed = True
        return

    def _getMarker(self, marker):
        """Get the text stored for `marker` or ``None``.

        Marker names are looked up case-insensitively. The file is
        parsed on first use.
        """
        marker = marker.lower()
        if not self._parsed:
            self._parseFile()
        return self._markers.get(marker)

    def getUserId(self):
        """Get text of the ``user`` marker or ``'<UNKNOWN>'``.
        """
        return self._getMarker('user') or '<UNKNOWN>'

    def getMode(self):
        """Get text of the ``mode`` marker or ``'<NOT SET>'``.
        """
        return self._getMarker('mode') or '<NOT SET>'

    def getStats(self):
        """Get text of the ``processed`` marker or a placeholder.
        """
        return self._getMarker('processed') or '<Info not avail.>'

    def getSourcePath(self):
        """Get text of the ``source`` marker or ``None``.
        """
        return self._getMarker('source') or None
382
383
class DataCenterStorageMovedEvent(ObjectEvent):
    """Event signalling that a datacenter changed its storage location.

    It is sent via ``grok.notify`` by `DataCenter.setStoragePath` with
    the datacenter as the event object.
    """
    grok.implements(IDataCenterStorageMovedEvent)
Note: See TracBrowser for help on using the repository browser.