source: main/waeup.kofa/trunk/src/waeup/kofa/datacenter.txt @ 8209

Last change on this file since 8209 was 7819, checked in by Henrik Bettermann, 13 years ago

KOFA -> Kofa

File size: 7.8 KB
Line 
1Kofa Data Center
2****************
3
4The Kofa data center takes care of managing CSV files and importing them.
5
6.. :doctest:
7.. :layer: waeup.kofa.testing.KofaUnitTestLayer
8
9Creating a data center
10======================
11
12A data center can be created easily:
13
14    >>> from waeup.kofa.datacenter import DataCenter
15    >>> mydatacenter = DataCenter()
16    >>> mydatacenter
17    <waeup.kofa.datacenter.DataCenter object at 0x...>
18
19Each data center has a location in file system where files are stored:
20
21    >>> storagepath = mydatacenter.storage
22    >>> storagepath
23    '/tmp/tmp...'
24
25
26Managing the storage path
27-------------------------
28
29We can set another storage path:
30
31    >>> import os
32    >>> os.mkdir('newlocation')
33    >>> newpath = os.path.abspath('newlocation')
34    >>> mydatacenter.setStoragePath(newpath)
35    []
36
37The result here is a list of filenames that could not be
38copied. Luckily, this list is empty.
39
40When we set a new storage path, we can ask the data center to move all
41files from the old location to the new one. To see this feature in action,
42we first have to put a file into the old (i.e. currently active) location:
43
44    >>> open(os.path.join(newpath, 'myfile.txt'), 'wb').write('hello')
45
46Now we can set a new location and the file will be copied:
47
48    >>> verynewpath = os.path.abspath('verynewlocation')
49    >>> os.mkdir(verynewpath)
50
51    >>> mydatacenter.setStoragePath(verynewpath, move=True)
52    []
53
54    >>> storagepath = mydatacenter.storage
55    >>> 'myfile.txt' in os.listdir(verynewpath)
56    True
57
58We remove the created file to have a clean testing environment for
59upcoming examples:
60
61    >>> os.unlink(os.path.join(storagepath, 'myfile.txt'))
62
63Uploading files
64===============
65
66We can get a list of files stored in that location:
67
68    >>> mydatacenter.getFiles()
69    []
70
71Let's put some file in the storage:
72
73    >>> import os
74    >>> filepath = os.path.join(storagepath, 'data.csv')
75    >>> open(filepath, 'wb').write('Some Content\n')
76
77Now we can find a file:
78
79    >>> mydatacenter.getFiles()
80    [<waeup.kofa.datacenter.DataCenterFile object at 0x...>]
81
82As we can see, the actual file is wrapped by a convenience wrapper,
83that enables us to fetch some data about the file. The data returned
84is formatted in strings, so that it can easily be put into output
85pages:
86
87    >>> datafile = mydatacenter.getFiles()[0]
88    >>> datafile.getSize()
89    '13 bytes'
90
91    >>> datafile.getDate() # Nearly current datetime...
92    '...'
93
94Clean up:
95
96    >>> import shutil
97    >>> shutil.rmtree(newpath)
98    >>> shutil.rmtree(verynewpath)
99
100
101Distributing processed files
102============================
103
104When files were processed by a batch processor, we can put the
105resulting files into desired destinations.
106
107We recreate the datacenter root in case it is missing:
108
109    >>> import os
110    >>> dc_root = mydatacenter.storage
111    >>> fin_dir = os.path.join(dc_root, 'finished')
112    >>> unfin_dir = os.path.join(dc_root, 'unfinished')
113
114    >>> def recreate_dc_storage():
115    ...   if os.path.exists(dc_root):
116    ...     shutil.rmtree(dc_root)
117    ...   os.mkdir(dc_root)
118    ...   mydatacenter.setStoragePath(mydatacenter.storage)
119    >>> recreate_dc_storage()
120
121We define a function that creates a set of faked result files:
122
123    >>> import os
124    >>> import tempfile
125    >>> def create_fake_results(source_basename, create_pending=True):
126    ...   tmp_dir = tempfile.mkdtemp()
127    ...   src = os.path.join(dc_root, source_basename)
128    ...   pending_src = None
129    ...   if create_pending:
130    ...     pending_src = os.path.join(tmp_dir, 'mypendingsource.csv')
131    ...   finished_src = os.path.join(tmp_dir, 'myfinishedsource.csv')
132    ...   for path in (src, pending_src, finished_src):
133    ...     if path is not None:
134    ...       open(path, 'wb').write('blah')
135    ...   return tmp_dir, src, finished_src, pending_src
136
137Now we can create the set of result files, that typically come after a
138successful processing of a regular source:
139
140Now we can try to distribute those files. Let's start with a source
141file, that was processed successfully:
142
143    >>> tmp_dir, src, finished_src, pending_src = create_fake_results(
144    ...  'mysource.csv', create_pending=False)
145    >>> mydatacenter.distProcessedFiles(True, src, finished_src,
146    ...                            pending_src, mode='create')
147    >>> sorted(os.listdir(dc_root))
148    ['finished', 'logs', 'unfinished']
149
150    >>> sorted(os.listdir(fin_dir))
151    ['mysource.create.finished.csv', 'mysource.csv']
152
153    >>> sorted(os.listdir(unfin_dir))
154    []
155
156The created dir will be removed for us by the datacenter. This way we
157can be assured that fewer temporary dirs are left hanging around:
158
159    >>> os.path.exists(tmp_dir)
160    False
161
162The root dir contains no more input files, while the original file and the
163file containing all processed data were moved to 'finished/'.
164
165Now we restart, but this time we fake an erroneous action:
166
167    >>> recreate_dc_storage()
168    >>> tmp_dir, src, finished_src, pending_src = create_fake_results(
169    ...  'mysource.csv')
170    >>> mydatacenter.distProcessedFiles(False, src, finished_src,
171    ...                                 pending_src, mode='create')
172    >>> sorted(os.listdir(dc_root))
173    ['finished', 'logs', 'mysource.create.pending.csv', 'unfinished']
174
175    >>> sorted(os.listdir(fin_dir))
176    ['mysource.create.finished.csv']
177
178    >>> sorted(os.listdir(unfin_dir))
179    ['mysource.csv']
180
181While the original source was moved to the 'unfinished' dir, the
182pending file went to the root and the set of already processed items
183is stored in 'finished/'.
184
185We fake processing the pending file and assume that everything went
186well this time:
187
188    >>> tmp_dir, src, finished_src, pending_src = create_fake_results(
189    ...  'mysource.create.pending.csv', create_pending=False)
190    >>> mydatacenter.distProcessedFiles(True, src, finished_src,
191    ...                                 pending_src, mode='create')
192
193    >>> sorted(os.listdir(dc_root))
194    ['finished', 'logs', 'unfinished']
195
196    >>> sorted(os.listdir(fin_dir))
197    ['mysource.create.finished.csv', 'mysource.csv']
198
199    >>> sorted(os.listdir(unfin_dir))
200    []
201
202The result is the same as in the first case shown above.
203
204We restart again, but this time we fake several non-working imports in
205a row.
206
207We start with a faulty start-import:
208
209    >>> recreate_dc_storage()
210    >>> tmp_dir, src, finished_src, pending_src = create_fake_results(
211    ...  'mysource.csv')
212    >>> mydatacenter.distProcessedFiles(False, src, finished_src,
213    ...                                 pending_src, mode='create')
214
215We try to process the pending file, which fails again:
216
217    >>> tmp_dir, src, finished_src, pending_src = create_fake_results(
218    ...  'mysource.create.pending.csv')
219    >>> mydatacenter.distProcessedFiles(False, src, finished_src,
220    ...                                 pending_src, mode='create')
221
222We try to process the new pending file:
223
224    >>> tmp_dir, src, finished_src, pending_src = create_fake_results(
225    ...  'mysource.create.pending.csv')
226    >>> mydatacenter.distProcessedFiles(False, src, finished_src,
227    ...                                 pending_src, mode='create')
228
229    >>> sorted(os.listdir(dc_root))
230    ['finished', 'logs', 'mysource.create.pending.csv', 'unfinished']
231
232    >>> sorted(os.listdir(fin_dir))
233    ['mysource.create.finished.csv']
234
235    >>> sorted(os.listdir(unfin_dir))
236    ['mysource.csv']
237
238Finally, we process the pending file and everything works:
239
240    >>> tmp_dir, src, finished_src, pending_src = create_fake_results(
241    ...  'mysource.create.pending.csv', create_pending=False)
242    >>> mydatacenter.distProcessedFiles(True, src, finished_src,
243    ...                                 pending_src, mode='create')
244
245    >>> sorted(os.listdir(dc_root))
246    ['finished', 'logs', 'unfinished']
247
248    >>> sorted(os.listdir(fin_dir))
249    ['mysource.create.finished.csv', 'mysource.csv']
250
251    >>> sorted(os.listdir(unfin_dir))
252    []
253
254The root dir is empty (contains no input files) and only the files in
255the 'finished' subdirectory remain.
256
257Clean up:
258
259    >>> shutil.rmtree(verynewpath)
Note: See TracBrowser for help on using the repository browser.