source: main/waeup.kofa/backup-trunk/tools/fix_import_file.py @ 10272

Last change on this file since 10272 was 9194, checked in by Henrik Bettermann, 12 years ago

Add converter for boolean.

  • Property svn:keywords set to Id
File size: 14.1 KB
Line 
1## $Id: fix_import_file.py 9194 2012-09-17 08:06:47Z henrik $
2##
3## Copyright (C) 2012 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""
19Fix exports from old SRP portal and other data sources to make
20them importable by current portal.
21
22Usage:
23
24Change into this directory, set the options below (files are assumed
25to be in the same directory) and then run
26
27  python fix_import_file.py <filename>
28
29Errors/warnings will be displayed on the shell, the output will be put
30into the specified output file.
31
32
33The lgas.py module must be copied into the same folder where this script
34is started.
35"""
36import csv
37import datetime
38import re
39import sys
40
41try:
42    from lgas import LGAS
43except:
44    print 'ERROR: lgas.py is missing.'
45    sys.exit(1)
46
47def strip(string):
48    string = string.replace('_', '')
49    string = string.replace('/', '')
50    string = string.replace('-', '')
51    string = string.replace(' ', '')
52    string = string.lower()
53    return string
54
55LGAS_inverted_stripped = dict([(strip(i[1]), i[0]) for i in LGAS])
56LGAS_dict = dict(LGAS)
57
58##
59## CONFIGURATION SECTION
60##
61# keys are fieldnames in input file, values are methods of class
62# Converter (see below)
63OPTIONS = {
64    'student_id': 'student_id',
65    'sex': 'gender',
66    'birthday': 'date',
67    'marit_stat': 'marit_stat',
68    'session': 'session',
69    'entry_session': 'session',
70    'current_session': 'session',
71    'session_id': 'session',
72    'entry_mode': 'mode',
73    'reg_state': 'reg_state',
74    'password': 'password',
75    'phone': 'phone',
76    'nationality': 'nationality',
77    'level': 'level',
78    'start_level': 'level',
79    'end_level': 'level',
80    'level_id': 'level',
81    'current_level': 'level',
82    'semester': 'semester',
83    'application_category': 'application_category',
84    'lga': 'lga',
85    'order_id': 'no_int',
86    'uniben': 'former',
87    'nysc_year': 'year',
88    'alr_date': 'date',
89    'fst_sit_date': 'date',
90    'scd_sit_date': 'date',
91    'emp_start': 'date',
92    'emp_end': 'date',
93    'emp_start2': 'date',
94    'emp_end2': 'date',
95    'fst_sit_results': 'result',
96    'scd_sit_results': 'result',
97    'alr_results': 'result',
98    'email': 'email',
99    'fst_sit_type': 'sittype',
100    'scd_sit_type': 'sittype',
101    'resp_pay_reference': 'no_int',
102    'type': 'company',
103    'date': 'date',
104    'core_or_elective': 'bool',
105    'category': 'p_category',
106    'reg_transition': 'reg_state',  # we completely change this column,
107                                    # since reg_state import is usually intended
108    'transition': 'reg_transition',
109    'payment_date': 'date',
110    'validation_date': 'date',
111    }
112
113# Mapping input file colnames --> output file colnames
114COLNAME_MAPPING = {
115    # base data
116    'id': 'student_id',
117    'reg_state': 'state',
118    'reg_transition': 'state',
119    'jamb_reg_no': 'reg_number',
120    'matric_no': 'matric_number',
121    'birthday': 'date_of_birth',
122    'clr_ac_pin': 'clr_code',
123    # clearance
124    'hq_grade': 'hq_degree',
125    'uniben': 'former_matric',
126    'hq_type2': 'hq2_type',
127    'hq_grade2': 'hq2_degree',
128    'hq_school2': 'hq2_school',
129    'hq_matric_no2': 'hq2_matric_no',
130    'hq_session2': 'hq2_session',
131    'hq_disc2': 'hq2_disc',
132    'emp': 'employer',
133    'emp2': 'employer2',
134    'emp_position2': 'emp2_position',
135    'emp_start2': 'emp2_start',
136    'emp_end2': 'emp2_end',
137    'emp_reason2': 'emp2_reason',
138    # study course
139    'study_course': 'certificate',
140    # study level
141    'session': 'level_session',
142    'verdict': 'level_verdict',
143    # course ticket
144    'level_id': 'level',
145    'core_or_elective': 'mandatory',
146    # payment ticket
147    'order_id': 'p_id',
148    'status': 'p_state',
149    'category': 'p_category',
150    'resp_pay_reference': 'r_pay_reference',
151    'resp_desc': 'r_desc',
152    'resp_approved_amount': 'r_amount_approved',
153    'item': 'p_item',
154    'amount': 'amount_auth',
155    'resp_card_num': 'r_card_num',
156    'resp_code': 'r_code',
157    'date': 'creation_date',
158    'surcharge': 'surcharge_1',
159    'session_id': 'p_session',
160    'type': 'r_company',
161    }
162
163# Mapping input state --> output state
164REGSTATE_MAPPING = {
165    'student_created': 'created',
166    'admitted': 'admitted',
167    'objection_raised': 'clearance started',
168    'clearance_pin_entered': 'clearance started',
169    'clearance_requested': 'clearance requested',
170    'cleared_and_validated': 'cleared',
171    'school_fee_paid': 'school fee paid',
172    'returning': 'returning',
173    'courses_registered': 'courses registered',
174    'courses_validated': 'courses validated',
175    'admit': 'admitted',
176    'return': 'returning'
177    }
178
179##
180## END OF CONFIG
181##
182
183# Look for the first sequence of numbers
184RE_PHONE = re.compile('[^\d]*(\d*)[^\d]*')
185
186def convert_fieldnames(fieldnames):
187    """Replace input fieldnames by fieldnames of COLNAME_MAPPING.
188    """
189    # Remove whitespaces
190    header = dict([(name, name.strip()) for name in fieldnames])
191    for in_name, out_name in COLNAME_MAPPING.items():
192        if in_name not in header.values():
193            continue
194        # Inverse dictionary lookup
195        key = [key for key,value in header.items() if value==in_name][0]
196        header[key] = out_name
197    return header
198
199class Converters():
200    """Converters to turn old-style values into new ones.
201    """
202
203    @classmethod
204    def student_id(self, value, row):
205        """ 'A123456' --> 'EA123456'
206        """
207        if len(value) == 7:
208            return 'K' + value
209        return value
210
211    @classmethod
212    def reg_state(self, value, row):
213        """ 'courses_validated' --> 'courses validated'
214        """
215        return REGSTATE_MAPPING.get(value,value)
216
217    @classmethod
218    def reg_transition(self, value, row):
219        if value == "admitted":
220            return "admit"
221        if value == "returning":
222            return "return"
223        return value
224
225    @classmethod
226    def level(self, value, row):
227        """ '000' --> '10'
228        '800' --> '999' if pg student
229        """
230        try:
231            number = int(value)
232        except ValueError:
233            return 9999
234        if number == 0:
235            return 10
236        if row.get('entry_mode') and row.get('entry_mode').startswith('pg'):
237            return 999
238        return number
239
240    @classmethod
241    def semester(self, value, row):
242        """ '0' --> '9'
243        """
244        try:
245            number = int(value)
246        except ValueError:
247            return 9999
248        if number == 0:
249            return 9
250        return number
251
252    @classmethod
253    def application_category(self, value, row):
254        """ '' --> 'no'
255        """
256        if value == '':
257            return 'no'
258        return value
259
260    @classmethod
261    def lga(self, value, row):
262        """ Remove apostrophe
263        """
264        if value == 'akwa_ibom_uru_offong_oruko':
265            return 'akwa_ibom_urue-offong-oruko'
266        if value == 'edo_ohionmwon':
267            return 'edo_orhionmwon'
268
269        if value == 'nassarawa_nassarawa':
270            return 'nassarawa_nassawara'
271
272        if value == 'kogi_mopa-muro-mopi':
273            return 'kogi_mopa-muro'
274
275        if value == 'delta_osimili-north':
276            return 'delta_oshielli-north'
277
278        if value == 'delta_osimili':
279            return 'delta_oshimili'
280
281        if value == 'delta_osimili-south':
282            return 'delta_oshimili-south'
283        try:
284            value = value.replace("'","")
285        except:
286            return ''
287        lower = value.lower()
288        if lower in LGAS_dict.keys():
289            return lower
290        # If real names are given, let's see if a similar value
291        # in LGAS exist.
292        value = LGAS_inverted_stripped.get(strip(lower), value)
293        return value
294
295
296    @classmethod
297    def session(self, value, row):
298        """ '08' --> '2008'
299        '2008/2009' --> '2008'
300        """
301        if '/' in value:
302            numbers = value.split('/')
303            number = int(numbers[0])
304            if number in range(2000,2015):
305                return number
306            else:
307                return 9999
308        try:
309            number = int(value)
310        except ValueError:
311            #import pdb; pdb.set_trace()
312            return 9999
313        if number < 14:
314            return number + 2000
315        elif number in range(2000,2015):
316            return number
317        else:
318            return 9999
319
320    @classmethod
321    def former(self, value, row):
322        """ True --> yes
323        '2008/2009' --> '2008'
324        """
325        if value == 'True':
326            return 'yes'
327        return
328
329    @classmethod
330    def bool(self, value, row):
331        """ True --> 1
332        """
333        if value in ('TRUE', 'True'):
334            return '1'
335        elif value in ('FALSE', 'False'):
336            return '0'
337        return
338
339    @classmethod
340    def year(self, value, row):
341        """ '0' --> ''
342        """
343        if value == '0':
344            return
345        if value == 'None':
346            return
347        return value
348
349
350    @classmethod
351    def marit_stat(self, value, row):
352        """ 'True'/'False' --> 'married'/'unmarried'
353        """
354        if value in ('True','married'):
355            value = 'married'
356        elif value in ('False','unmarried'):
357            value = 'unmarried'
358        else:
359            value = ''
360        return value
361
362    @classmethod
363    def gender(self, value, row):
364        """ 'True'/'False' --> 'f'/'m'
365        """
366        if value.strip() in ('F', 'True','f'):
367            value = 'f'
368        elif value.strip() in ('M', 'False','m'):
369            value = 'm'
370        else:
371            value = ''
372        return value
373
374    @classmethod
375    def date(self, value, row):
376        """ 'yyyy/mm/dd' --> 'yyyy-mm-dd'
377        """
378        if value == "None":
379            value = ""
380        elif value == "":
381            value = ""
382        else:
383            value = value.replace('/', '-')
384            # We add the hash symbol to avoid automatic date transformation
385            # in Excel and Calc for further processing
386            value += '#'
387        return value
388
389    @classmethod
390    def no_int(self, value, row):
391        """ Add hash and skip numbers starting with 999999
392        """
393        # We add the hash symbol to avoid automatic number transformation
394        # in Excel and Calc for further processing
395        try:
396            intvalue = int(value)
397            value += '#'
398        except:
399            pass
400        if value.startswith('999999'):
401            return
402        return value
403
404    @classmethod
405    def mode(self, value, row):
406        if value == "transfer_fulltime":
407            return "transfer_ft"
408        if value == "ume_ft":
409            return "utme_ft"
410        return value
411
412    @classmethod
413    def password(self, value, row):
414        if value == "not set":
415            return ""
416        return value
417
418    @classmethod
419    def nationality(self, value, row):
420        if value == "nigeria":
421            return "NG"
422        if value == "niger":
423            return "NE"
424        return value
425
426    @classmethod
427    def sittype(self, value, row):
428        if value == "nabtec":
429            return "nabteb"
430        return value
431
432    @classmethod
433    def company(self, value, row):
434        if value == "online":
435            return "interswitch"
436        return value
437
438    @classmethod
439    def p_category(self, value, row):
440        if value == "acceptance":
441            return "clearance"
442        return value
443
444    @classmethod
445    def email(self, value, row):
446        return value.strip()
447
448    @classmethod
449    def phone(self, value, row):
450        """ '<num-seq1>-<num-seq2> asd' -> '--<num-seq1><num-seq2>'
451
452        Dashes and slashes are removed before looking for sequences
453        of numbers.
454        """
455        if not value:
456            return
457        value = value.strip('#')
458        value = value.replace('-', '')
459        value = value.replace('/', '')
460        match = RE_PHONE.match(value)
461        phone = match.groups()[0]
462        if value.startswith('234'):
463            value = '+' + value[:3] + '-' + value[3:]
464        else:
465            value = '-%s' % phone
466        return value + '#'
467
468    @classmethod
469    def result(self, value, row):
470        try:
471            liste = eval(value)
472        except:
473            return
474        if isinstance(liste,list):
475            return [(i[0].lower(),i[1]) for i in liste]
476        return
477
478
479def main():
480    input_file = '%s' % sys.argv[1]
481    output_file = '%s_edited.csv' % sys.argv[1].split('.')[0]
482    reader = csv.DictReader(open(input_file, 'rb'))
483    writer = None
484
485    for num, row in enumerate(reader):
486        if num == 0:
487            writer = csv.DictWriter(open(output_file, 'wb'), reader.fieldnames)
488            print "FIELDS: "
489            for x, y in enumerate(reader.fieldnames):
490                print x, y
491            header = convert_fieldnames(reader.fieldnames)
492            writer.writerow(header)
493        if row.get('reg_state') == 'student_created':
494            # We do not reimport student records which have never been accessed.
495            continue
496        if row.get('status') == 'started':
497            # We do not reimport started payments.
498            continue
499        for key, value in row.items():
500            # Remove unwanted whitespaces.
501            row[key] = row[key].strip()
502            if not key in OPTIONS.keys():
503                continue
504            conv_name = OPTIONS[key]
505            converter = getattr(Converters, conv_name, None)
506            if converter is None:
507                print "WARNING: cannot find converter %s" % conv_name
508                continue
509            row[key] = converter(row[key], row)
510        try:
511            writer.writerow(row)
512        except:
513            print row['student_id']
514
515    print "Output written to %s" % output_file
516
517
518if __name__ == '__main__':
519    if len(sys.argv) != 2:
520        print 'Usage: %s <filename>' % __file__
521        sys.exit(1)
522    main()
Note: See TracBrowser for help on using the repository browser.