source: main/waeup.kofa/trunk/tools/fix_import_file.py @ 9250

Last change on this file since 9250 was 9194, checked in by Henrik Bettermann, 12 years ago

Add converter for boolean.

  • Property svn:keywords set to Id
File size: 14.1 KB
RevLine 
[7512]1## $Id: fix_import_file.py 9194 2012-09-17 08:06:47Z henrik $
2##
[7518]3## Copyright (C) 2012 Uli Fouquet & Henrik Bettermann
[7512]4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""
[8781]19Fix exports from old SRP portal and other data sources to make
20them importable by current portal.
[7512]21
22Usage:
23
24Change into this directory, set the options below (files are assumed
25to be in the same directory) and then run
26
[7518]27  python fix_import_file.py <filename>
[7512]28
29Errors/warnings will be displayed on the shell, the output will be put
30into the specified output file.
[8781]31
32
33The lgas.py module must be copied into the same folder where this script
34is started.
[7512]35"""
[7573]36import csv
37import datetime
[7575]38import re
[7573]39import sys
[7512]40
[8781]41try:
42    from lgas import LGAS
43except:
44    print 'ERROR: lgas.py is missing.'
45    sys.exit(1)
46
47def strip(string):
48    string = string.replace('_', '')
49    string = string.replace('/', '')
50    string = string.replace('-', '')
51    string = string.replace(' ', '')
52    string = string.lower()
53    return string
54
55LGAS_inverted_stripped = dict([(strip(i[1]), i[0]) for i in LGAS])
56LGAS_dict = dict(LGAS)
57
[7512]58##
59## CONFIGURATION SECTION
60##
61# keys are fieldnames in input file, values are methods of class
62# Converter (see below)
63OPTIONS = {
[8478]64    'student_id': 'student_id',
[7512]65    'sex': 'gender',
66    'birthday': 'date',
[7514]67    'marit_stat': 'marit_stat',
[7537]68    'session': 'session',
[7514]69    'entry_session': 'session',
70    'current_session': 'session',
[7602]71    'session_id': 'session',
[7632]72    'entry_mode': 'mode',
[7526]73    'reg_state': 'reg_state',
74    'password': 'password',
[7575]75    'phone': 'phone',
[8478]76    'nationality': 'nationality',
[7602]77    'level': 'level',
78    'start_level': 'level',
79    'end_level': 'level',
80    'level_id': 'level',
81    'current_level': 'level',
82    'semester': 'semester',
83    'application_category': 'application_category',
[7610]84    'lga': 'lga',
[7628]85    'order_id': 'no_int',
[8938]86    'uniben': 'former',
87    'nysc_year': 'year',
88    'alr_date': 'date',
89    'fst_sit_date': 'date',
90    'scd_sit_date': 'date',
91    'emp_start': 'date',
92    'emp_end': 'date',
93    'emp_start2': 'date',
94    'emp_end2': 'date',
95    'fst_sit_results': 'result',
96    'scd_sit_results': 'result',
97    'alr_results': 'result',
[8939]98    'email': 'email',
99    'fst_sit_type': 'sittype',
100    'scd_sit_type': 'sittype',
[8941]101    'resp_pay_reference': 'no_int',
102    'type': 'company',
103    'date': 'date',
[9194]104    'core_or_elective': 'bool',
[8945]105    'category': 'p_category',
[9007]106    'reg_transition': 'reg_state',  # we completely change this column,
107                                    # since reg_state import is usually intended
108    'transition': 'reg_transition',
[9158]109    'payment_date': 'date',
[9160]110    'validation_date': 'date',
[7512]111    }
[7516]112
113# Mapping input file colnames --> output file colnames
114COLNAME_MAPPING = {
[7666]115    # base data
[9009]116    'id': 'student_id',
[8478]117    'reg_state': 'state',
[9007]118    'reg_transition': 'state',
[7518]119    'jamb_reg_no': 'reg_number',
[7645]120    'matric_no': 'matric_number',
[7516]121    'birthday': 'date_of_birth',
[7526]122    'clr_ac_pin': 'clr_code',
[8941]123    # clearance
124    'hq_grade': 'hq_degree',
125    'uniben': 'former_matric',
126    'hq_type2': 'hq2_type',
127    'hq_grade2': 'hq2_degree',
128    'hq_school2': 'hq2_school',
129    'hq_matric_no2': 'hq2_matric_no',
130    'hq_session2': 'hq2_session',
131    'hq_disc2': 'hq2_disc',
132    'emp': 'employer',
133    'emp2': 'employer2',
134    'emp_position2': 'emp2_position',
135    'emp_start2': 'emp2_start',
136    'emp_end2': 'emp2_end',
137    'emp_reason2': 'emp2_reason',
[7550]138    # study course
[7530]139    'study_course': 'certificate',
[7550]140    # study level
[7537]141    'session': 'level_session',
142    'verdict': 'level_verdict',
[7550]143    # course ticket
144    'level_id': 'level',
[7666]145    'core_or_elective': 'mandatory',
[7628]146    # payment ticket
147    'order_id': 'p_id',
148    'status': 'p_state',
149    'category': 'p_category',
150    'resp_pay_reference': 'r_pay_reference',
151    'resp_desc': 'r_desc',
152    'resp_approved_amount': 'r_amount_approved',
153    'item': 'p_item',
154    'amount': 'amount_auth',
155    'resp_card_num': 'r_card_num',
156    'resp_code': 'r_code',
157    'date': 'creation_date',
158    'surcharge': 'surcharge_1',
159    'session_id': 'p_session',
[8941]160    'type': 'r_company',
[7516]161    }
[7526]162
[8478]163# Mapping input state --> output state
[7526]164REGSTATE_MAPPING = {
165    'student_created': 'created',
166    'admitted': 'admitted',
[7610]167    'objection_raised': 'clearance started',
[7526]168    'clearance_pin_entered': 'clearance started',
169    'clearance_requested': 'clearance requested',
170    'cleared_and_validated': 'cleared',
171    'school_fee_paid': 'school fee paid',
172    'returning': 'returning',
173    'courses_registered': 'courses registered',
174    'courses_validated': 'courses validated',
[9007]175    'admit': 'admitted',
176    'return': 'returning'
[7526]177    }
178
[7512]179##
180## END OF CONFIG
181##
182
[7575]183# Look for the first sequence of numbers
184RE_PHONE = re.compile('[^\d]*(\d*)[^\d]*')
185
[7516]186def convert_fieldnames(fieldnames):
187    """Replace input fieldnames by fieldnames of COLNAME_MAPPING.
188    """
[9009]189    # Remove whitespaces
[9019]190    header = dict([(name, name.strip()) for name in fieldnames])
[7516]191    for in_name, out_name in COLNAME_MAPPING.items():
[9019]192        if in_name not in header.values():
[7516]193            continue
[9019]194        # Inverse dictionary lookup
195        key = [key for key,value in header.items() if value==in_name][0]
196        header[key] = out_name
[7514]197    return header
198
[7512]199class Converters():
200    """Converters to turn old-style values into new ones.
201    """
[8478]202
[7512]203    @classmethod
[9009]204    def student_id(self, value, row):
[8478]205        """ 'A123456' --> 'EA123456'
206        """
207        if len(value) == 7:
[9194]208            return 'K' + value
[8478]209        return value
210
211    @classmethod
[9009]212    def reg_state(self, value, row):
[7526]213        """ 'courses_validated' --> 'courses validated'
214        """
215        return REGSTATE_MAPPING.get(value,value)
216
217    @classmethod
[9009]218    def reg_transition(self, value, row):
[9007]219        if value == "admitted":
220            return "admit"
221        if value == "returning":
222            return "return"
223        return value
224
225    @classmethod
[9009]226    def level(self, value, row):
[7602]227        """ '000' --> '10'
[9009]228        '800' --> '999' if pg student
[7602]229        """
230        try:
231            number = int(value)
232        except ValueError:
233            return 9999
234        if number == 0:
235            return 10
[9158]236        if row.get('entry_mode') and row.get('entry_mode').startswith('pg'):
[9009]237            return 999
[7602]238        return number
239
240    @classmethod
[9009]241    def semester(self, value, row):
[7602]242        """ '0' --> '9'
243        """
244        try:
245            number = int(value)
246        except ValueError:
247            return 9999
248        if number == 0:
249            return 9
250        return number
251
252    @classmethod
[9009]253    def application_category(self, value, row):
[7602]254        """ '' --> 'no'
255        """
256        if value == '':
257            return 'no'
258        return value
259
[7610]260    @classmethod
[9009]261    def lga(self, value, row):
[7610]262        """ Remove apostrophe
263        """
264        if value == 'akwa_ibom_uru_offong_oruko':
265            return 'akwa_ibom_urue-offong-oruko'
[8939]266        if value == 'edo_ohionmwon':
267            return 'edo_orhionmwon'
268
269        if value == 'nassarawa_nassarawa':
270            return 'nassarawa_nassawara'
271
272        if value == 'kogi_mopa-muro-mopi':
273            return 'kogi_mopa-muro'
274
275        if value == 'delta_osimili-north':
276            return 'delta_oshielli-north'
277
278        if value == 'delta_osimili':
279            return 'delta_oshimili'
280
281        if value == 'delta_osimili-south':
282            return 'delta_oshimili-south'
[7610]283        try:
284            value = value.replace("'","")
285        except:
286            return ''
[8784]287        lower = value.lower()
288        if lower in LGAS_dict.keys():
289            return lower
[8781]290        # If real names are given, let's see if a similar value
291        # in LGAS exist.
[8784]292        value = LGAS_inverted_stripped.get(strip(lower), value)
[7610]293        return value
[7602]294
[7610]295
[7602]296    @classmethod
[9009]297    def session(self, value, row):
[7514]298        """ '08' --> '2008'
[8478]299        '2008/2009' --> '2008'
[7514]300        """
[8478]301        if '/' in value:
302            numbers = value.split('/')
303            number = int(numbers[0])
304            if number in range(2000,2015):
305                return number
306            else:
307                return 9999
[7514]308        try:
309            number = int(value)
310        except ValueError:
[7602]311            #import pdb; pdb.set_trace()
[7514]312            return 9999
313        if number < 14:
314            return number + 2000
315        elif number in range(2000,2015):
316            return number
317        else:
318            return 9999
319
320    @classmethod
[9009]321    def former(self, value, row):
[8938]322        """ True --> yes
323        '2008/2009' --> '2008'
324        """
325        if value == 'True':
326            return 'yes'
327        return
328
329    @classmethod
[9194]330    def bool(self, value, row):
331        """ True --> 1
332        """
333        if value in ('TRUE', 'True'):
334            return '1'
335        elif value in ('FALSE', 'False'):
336            return '0'
337        return
338
339    @classmethod
[9009]340    def year(self, value, row):
[8938]341        """ '0' --> ''
342        """
343        if value == '0':
344            return
345        if value == 'None':
346            return
347        return value
348
349
350    @classmethod
[9009]351    def marit_stat(self, value, row):
[7514]352        """ 'True'/'False' --> 'married'/'unmarried'
353        """
[7610]354        if value in ('True','married'):
[7514]355            value = 'married'
[7610]356        elif value in ('False','unmarried'):
[7514]357            value = 'unmarried'
358        else:
359            value = ''
360        return value
361
362    @classmethod
[9009]363    def gender(self, value, row):
[7526]364        """ 'True'/'False' --> 'f'/'m'
[7514]365        """
[8785]366        if value.strip() in ('F', 'True','f'):
[7526]367            value = 'f'
[8785]368        elif value.strip() in ('M', 'False','m'):
[7526]369            value = 'm'
[7514]370        else:
371            value = ''
372        return value
373
374    @classmethod
[9009]375    def date(self, value, row):
[7512]376        """ 'yyyy/mm/dd' --> 'yyyy-mm-dd'
377        """
378        if value == "None":
379            value = ""
380        elif value == "":
381            value = ""
382        else:
383            value = value.replace('/', '-')
[7514]384            # We add the hash symbol to avoid automatic date transformation
385            # in Excel and Calc for further processing
386            value += '#'
[7512]387        return value
388
389    @classmethod
[9009]390    def no_int(self, value, row):
[8941]391        """ Add hash and skip numbers starting with 999999
[7628]392        """
393        # We add the hash symbol to avoid automatic number transformation
394        # in Excel and Calc for further processing
[8941]395        try:
396            intvalue = int(value)
397            value += '#'
398        except:
399            pass
400        if value.startswith('999999'):
401            return
[7628]402        return value
403
404    @classmethod
[9009]405    def mode(self, value, row):
[7632]406        if value == "transfer_fulltime":
407            return "transfer_ft"
[8522]408        if value == "ume_ft":
409            return "utme_ft"
[7632]410        return value
411
412    @classmethod
[9009]413    def password(self, value, row):
[7526]414        if value == "not set":
415            return ""
416        return value
417
[7575]418    @classmethod
[9009]419    def nationality(self, value, row):
[8478]420        if value == "nigeria":
421            return "NG"
[8939]422        if value == "niger":
423            return "NE"
[8478]424        return value
425
426    @classmethod
[9009]427    def sittype(self, value, row):
[8939]428        if value == "nabtec":
429            return "nabteb"
430        return value
431
432    @classmethod
[9009]433    def company(self, value, row):
[8941]434        if value == "online":
435            return "interswitch"
436        return value
437
438    @classmethod
[9009]439    def p_category(self, value, row):
[8941]440        if value == "acceptance":
441            return "clearance"
442        return value
443
444    @classmethod
[9009]445    def email(self, value, row):
[8939]446        return value.strip()
447
448    @classmethod
[9009]449    def phone(self, value, row):
[7575]450        """ '<num-seq1>-<num-seq2> asd' -> '--<num-seq1><num-seq2>'
[7526]451
[7575]452        Dashes and slashes are removed before looking for sequences
453        of numbers.
454        """
[8938]455        if not value:
456            return
[9007]457        value = value.strip('#')
[7575]458        value = value.replace('-', '')
459        value = value.replace('/', '')
460        match = RE_PHONE.match(value)
461        phone = match.groups()[0]
[9007]462        if value.startswith('234'):
463            value = '+' + value[:3] + '-' + value[3:]
464        else:
465            value = '-%s' % phone
[8938]466        return value + '#'
[7575]467
[8938]468    @classmethod
[9009]469    def result(self, value, row):
[8938]470        try:
471            liste = eval(value)
472        except:
473            return
474        if isinstance(liste,list):
475            return [(i[0].lower(),i[1]) for i in liste]
476        return
[7575]477
[9007]478
[7572]479def main():
[7573]480    input_file = '%s' % sys.argv[1]
481    output_file = '%s_edited.csv' % sys.argv[1].split('.')[0]
482    reader = csv.DictReader(open(input_file, 'rb'))
[7572]483    writer = None
[7512]484
[7572]485    for num, row in enumerate(reader):
486        if num == 0:
[7573]487            writer = csv.DictWriter(open(output_file, 'wb'), reader.fieldnames)
[7572]488            print "FIELDS: "
489            for x, y in enumerate(reader.fieldnames):
490                print x, y
491            header = convert_fieldnames(reader.fieldnames)
492            writer.writerow(header)
[8939]493        if row.get('reg_state') == 'student_created':
[9009]494            # We do not reimport student records which have never been accessed.
[8939]495            continue
[8941]496        if row.get('status') == 'started':
[9009]497            # We do not reimport started payments.
[8941]498            continue
[7572]499        for key, value in row.items():
[9009]500            # Remove unwanted whitespaces.
501            row[key] = row[key].strip()
[7572]502            if not key in OPTIONS.keys():
503                continue
504            conv_name = OPTIONS[key]
505            converter = getattr(Converters, conv_name, None)
506            if converter is None:
507                print "WARNING: cannot find converter %s" % conv_name
508                continue
[9009]509            row[key] = converter(row[key], row)
[7602]510        try:
511            writer.writerow(row)
512        except:
513            print row['student_id']
[7512]514
[7573]515    print "Output written to %s" % output_file
[7572]516
517
518if __name__ == '__main__':
519    if len(sys.argv) != 2:
520        print 'Usage: %s <filename>' % __file__
521        sys.exit(1)
522    main()
Note: See TracBrowser for help on using the repository browser.