source: main/waeup.kofa/branches/0.2/tools/fix_import_file.py @ 15085

Last change on this file since 15085 was 9478, checked in by Henrik Bettermann, 12 years ago

Add converter for amounts and other things.

  • Property svn:keywords set to Id
File size: 15.5 KB
Line 
1## $Id: fix_import_file.py 9478 2012-10-30 20:19:06Z henrik $
2##
3## Copyright (C) 2012 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""
19Fix exports from old SRP portal and other data sources to make
20them importable by current portal.
21
22Usage:
23
24Change into this directory, set the options below (files are assumed
25to be in the same directory) and then run
26
27  python fix_import_file.py <filename>
28
29Errors/warnings will be displayed on the shell, the output will be put
30into the specified output file.
31
32
33The lgas.py module must be copied into the same folder where this script
34is started.
35"""
36import csv
37import datetime
38import os
39import re
40import sys
41
42try:
43    from lgas import LGAS
44except:
45    print 'ERROR: lgas.py is missing.'
46    sys.exit(1)
47
48def strip(string):
49    string = string.replace('_', '')
50    string = string.replace('/', '')
51    string = string.replace('-', '')
52    string = string.replace(' ', '')
53    string = string.lower()
54    return string
55
56LGAS_inverted_stripped = dict([(strip(i[1]), i[0]) for i in LGAS])
57LGAS_dict = dict(LGAS)
58
59##
60## CONFIGURATION SECTION
61##
62# keys are fieldnames in input file, values are methods of class
63# Converter (see below)
64OPTIONS = {
65    'student_id': 'student_id',
66    'sex': 'gender',
67    'birthday': 'date',
68    'marit_stat': 'marit_stat',
69    'session': 'session',
70    'entry_session': 'session',
71    'current_session': 'session',
72    'session_id': 'session',
73    'entry_mode': 'mode',
74    'reg_state': 'reg_state',
75    'password': 'password',
76    'phone': 'phone',
77    'nationality': 'nationality',
78    'level': 'level',
79    'start_level': 'level',
80    'end_level': 'level',
81    'level_id': 'level',
82    'current_level': 'level',
83    'semester': 'semester',
84    'application_category': 'application_category',
85    'lga': 'lga',
86    'order_id': 'no_int',
87    'uniben': 'former',
88    'nysc_year': 'year',
89    'alr_date': 'date',
90    'fst_sit_date': 'date',
91    'scd_sit_date': 'date',
92    'emp_start': 'date',
93    'emp_end': 'date',
94    'emp_start2': 'date',
95    'emp_end2': 'date',
96    'fst_sit_results': 'result',
97    'scd_sit_results': 'result',
98    'alr_results': 'result',
99    'email': 'email',
100    'fst_sit_type': 'sittype',
101    'scd_sit_type': 'sittype',
102    'resp_pay_reference': 'no_int',
103    'type': 'company',
104    'date': 'date',
105    'core_or_elective': 'bool',
106    'category': 'p_category',
107    'reg_transition': 'reg_state',  # we completely change this column,
108                                    # since reg_state import is usually intended
109    'transition': 'reg_transition',
110    'payment_date': 'date',
111    'validation_date': 'date',
112    'resp_approved_amount': 'amount',
113    'amount': 'amount',
114    'amount_auth': 'amount',
115    'surcharge': 'amount'
116    }
117
118# Mapping input file colnames --> output file colnames
119COLNAME_MAPPING = {
120    # base data
121    'id': 'student_id',
122    'reg_state': 'state',
123    'reg_transition': 'state',
124    'jamb_reg_no': 'reg_number',
125    'matric_no': 'matric_number',
126    'birthday': 'date_of_birth',
127    'clr_ac_pin': 'clr_code',
128    # clearance
129    'hq_grade': 'hq_degree',
130    'uniben': 'former_matric',
131    'hq_type2': 'hq2_type',
132    'hq_grade2': 'hq2_degree',
133    'hq_school2': 'hq2_school',
134    'hq_matric_no2': 'hq2_matric_no',
135    'hq_session2': 'hq2_session',
136    'hq_disc2': 'hq2_disc',
137    'emp': 'employer',
138    'emp2': 'employer2',
139    'emp_position2': 'emp2_position',
140    'emp_start2': 'emp2_start',
141    'emp_end2': 'emp2_end',
142    'emp_reason2': 'emp2_reason',
143    # study course
144    'study_course': 'certificate',
145    # study level
146    'session': 'level_session',
147    'verdict': 'level_verdict',
148    # course ticket
149    'level_id': 'level',
150    'core_or_elective': 'mandatory',
151    # payment ticket
152    'order_id': 'p_id',
153    'status': 'p_state',
154    'category': 'p_category',
155    'resp_pay_reference': 'r_pay_reference',
156    'resp_desc': 'r_desc',
157    'resp_approved_amount': 'r_amount_approved',
158    'item': 'p_item',
159    'amount': 'amount_auth',
160    'resp_card_num': 'r_card_num',
161    'resp_code': 'r_code',
162    'date': 'creation_date',
163    'surcharge': 'surcharge_1',
164    'session_id': 'p_session',
165    'type': 'r_company',
166    'old_id': 'old_id',
167    }
168
169# Mapping input state --> output state
170REGSTATE_MAPPING = {
171    'student_created': 'created',
172    'admitted': 'admitted',
173    'objection_raised': 'clearance started',
174    'clearance_pin_entered': 'clearance started',
175    'clearance_requested': 'clearance requested',
176    'cleared_and_validated': 'cleared',
177    'school_fee_paid': 'school fee paid',
178    'returning': 'returning',
179    'courses_registered': 'courses registered',
180    'courses_validated': 'courses validated',
181    'admit': 'admitted',
182    'return': 'returning'
183    }
184
185# Mapping of special cases, where new id is not deductible from old id
186# Set to `None`, if no such special cases should be considered.
187ID_MAP_CSV = None
188ID_MAP_CSV = "id_mapping.csv"
189
190##
191## END OF CONFIG
192##
193
194# Look for the first sequence of numbers
195RE_PHONE = re.compile('[^\d]*(\d*)[^\d]*')
196
197def get_id_mapping():
198    """Returns a dict mapping from old (SRP) ids to new ids.
199
200    The dict is read from ID_MAP_CSV file. If this var is set to
201    ``None`` an empty dict is returned. The ID_MAP_CSV contains only
202    the student ids of those students, for which the standard method
203    (new_id=CHAR+old_id) does not work.
204    """
205    if ID_MAP_CSV is None:
206        return {}
207    if not os.path.isfile(ID_MAP_CSV):
208        raise IOError(
209            "No such file for mapping old to new ids: %s" % ID_MAP_CSV)
210    result = dict()
211    reader = csv.DictReader(open(ID_MAP_CSV, 'rb'))
212    for row in reader:
213        result[row['student_id']] = row['new_id']
214    return result
215
216
217def convert_fieldnames(fieldnames):
218    """Replace input fieldnames by fieldnames of COLNAME_MAPPING.
219    """
220    # Remove whitespaces
221    header = dict([(name, name.strip()) for name in fieldnames])
222    for in_name, out_name in COLNAME_MAPPING.items():
223        if in_name not in header.values():
224            continue
225        # Inverse dictionary lookup
226        key = [key for key,value in header.items() if value==in_name][0]
227        header[key] = out_name
228    return header
229
230class Converters():
231    """Converters to turn old-style values into new ones.
232    """
233
234    old_new_id_map = get_id_mapping()
235
236    @classmethod
237    def student_id(cls, value, row):
238        """ 'A123456' --> 'EA123456'
239        """
240        value = cls.old_new_id_map.get(value, value)
241        if len(value) == 7:
242            return 'B' + value
243        return value
244
245    @classmethod
246    def reg_state(self, value, row):
247        """ 'courses_validated' --> 'courses validated'
248        """
249        return REGSTATE_MAPPING.get(value,value)
250
251    @classmethod
252    def reg_transition(self, value, row):
253        if value == "admitted":
254            return "admit"
255        if value == "returning":
256            return "return"
257        return value
258
259    @classmethod
260    def level(self, value, row):
261        """ '000' --> '10'
262        '800' --> '999' if pg student
263        """
264        try:
265            number = int(value)
266        except ValueError:
267            return 9999
268        if number == 0:
269            return 10
270        if row.get('entry_mode') and row.get('entry_mode').startswith('pg'):
271            return 999
272        return number
273
274    @classmethod
275    def semester(self, value, row):
276        """ '0' --> '9'
277        """
278        try:
279            number = int(value)
280        except ValueError:
281            return 9999
282        if number == 0:
283            return 9
284        return number
285
286    @classmethod
287    def application_category(self, value, row):
288        """ '' --> 'no'
289        """
290        if value == '':
291            return 'no'
292        return value
293
294    @classmethod
295    def lga(self, value, row):
296        """ Remove apostrophe
297        """
298        if value == 'akwa_ibom_uru_offong_oruko':
299            return 'akwa_ibom_urue-offong-oruko'
300        if value == 'edo_ohionmwon':
301            return 'edo_orhionmwon'
302
303        if value == 'nassarawa_nassarawa':
304            return 'nassarawa_nassawara'
305
306        if value == 'kogi_mopa-muro-mopi':
307            return 'kogi_mopa-muro'
308
309        if value == 'delta_osimili-north':
310            return 'delta_oshielli-north'
311
312        if value == 'delta_osimili':
313            return 'delta_oshimili'
314
315        if value == 'delta_osimili-south':
316            return 'delta_oshimili-south'
317        try:
318            value = value.replace("'","")
319        except:
320            return ''
321        lower = value.lower()
322        if lower in LGAS_dict.keys():
323            return lower
324        # If real names are given, let's see if a similar value
325        # in LGAS exist.
326        value = value.replace('_',' ')
327        value = LGAS_inverted_stripped.get(strip(lower), value)
328        return value
329
330
331    @classmethod
332    def session(self, value, row):
333        """ '08' --> '2008'
334        '2008/2009' --> '2008'
335        """
336        if '/' in value:
337            numbers = value.split('/')
338            number = int(numbers[0])
339            if number in range(2000,2015):
340                return number
341            else:
342                return 9999
343        try:
344            number = int(value)
345        except ValueError:
346            return 9999
347        if number < 14:
348            return number + 2000
349        elif number in range(2000,2015):
350            return number
351        else:
352            return 9999
353
354    @classmethod
355    def former(self, value, row):
356        """ True --> yes
357        '2008/2009' --> '2008'
358        """
359        if value == 'True':
360            return 'yes'
361        return
362
363    @classmethod
364    def bool(self, value, row):
365        """ True --> 1
366        """
367        if value in ('TRUE', 'True'):
368            return '1'
369        elif value in ('FALSE', 'False'):
370            return '0'
371        return
372
373    @classmethod
374    def year(self, value, row):
375        """ '0' --> ''
376        """
377        if value == '0':
378            return
379        if value == 'None':
380            return
381        return value
382
383
384    @classmethod
385    def marit_stat(self, value, row):
386        """ 'True'/'False' --> 'married'/'unmarried'
387        """
388        if value in ('True','married'):
389            value = 'married'
390        elif value in ('False','unmarried'):
391            value = 'unmarried'
392        else:
393            value = ''
394        return value
395
396    @classmethod
397    def gender(self, value, row):
398        """ 'True'/'False' --> 'f'/'m'
399        """
400        if value.strip() in ('F', 'True','f'):
401            value = 'f'
402        elif value.strip() in ('M', 'False','m'):
403            value = 'm'
404        else:
405            value = ''
406        return value
407
408    @classmethod
409    def date(self, value, row):
410        """ 'yyyy/mm/dd' --> 'yyyy-mm-dd'
411        """
412        if value == "None":
413            value = ""
414        elif value == "":
415            value = ""
416        else:
417            value = value.replace('/', '-')
418            # We add the hash symbol to avoid automatic date transformation
419            # in Excel and Calc for further processing
420            value += '#'
421        return value
422
423    @classmethod
424    def no_int(self, value, row):
425        """ Add hash and skip numbers starting with 999999
426        """
427        # We add the hash symbol to avoid automatic number transformation
428        # in Excel and Calc for further processing
429        try:
430            intvalue = int(value)
431            value += '#'
432        except:
433            pass
434        if value.startswith('999999'):
435            return
436        return value
437
438    @classmethod
439    def amount(self, value, row):
440        """ Amounts must be integers
441        """
442        try:
443            return float(value)
444        except:
445            return
446
447    @classmethod
448    def mode(self, value, row):
449        if value == "transfer_fulltime":
450            return "transfer_ft"
451        if value == "ume_ft":
452            return "utme_ft"
453        return value
454
455    @classmethod
456    def password(self, value, row):
457        if value == "not set":
458            return ""
459        return value
460
461    @classmethod
462    def nationality(self, value, row):
463        if value in ('nigeria', 'Nigeria'):
464            return "NG"
465        if value in ('niger', 'Niger'):
466            return "NE"
467        return value
468
469    @classmethod
470    def sittype(self, value, row):
471        if value == "nabtec":
472            return "nabteb"
473        return value
474
475    @classmethod
476    def company(self, value, row):
477        if value == "online":
478            return "interswitch"
479        return value
480
481    @classmethod
482    def p_category(self, value, row):
483        if value == "acceptance":
484            return "clearance"
485        return value
486
487    @classmethod
488    def email(self, value, row):
489        return value.strip()
490
491    @classmethod
492    def phone(self, value, row):
493        """ '<num-seq1>-<num-seq2> asd' -> '--<num-seq1><num-seq2>'
494
495        Dashes and slashes are removed before looking for sequences
496        of numbers.
497        """
498        if not value:
499            return
500        if len(value) < 5:
501            return
502        value = value.strip('#')
503        value = value.replace('-', '')
504        value = value.replace('/', '')
505        match = RE_PHONE.match(value)
506        phone = match.groups()[0]
507        if value.startswith('234'):
508            value = '+' + value[:3] + '-' + value[3:]
509        elif value.startswith('+234'):
510            value = value[:4] + '-' + value[4:]
511        else:
512            value = '-%s' % phone
513        return value + '#'
514
515    @classmethod
516    def result(self, value, row):
517        try:
518            liste = eval(value)
519        except:
520            return
521        if isinstance(liste,list):
522            return [(i[0].lower(),i[1]) for i in liste]
523        return
524
525
526def main():
527    input_file = '%s' % sys.argv[1]
528    output_file = '%s_edited.csv' % sys.argv[1].split('.')[0]
529    reader = csv.DictReader(open(input_file, 'rb'))
530    writer = None
531
532    for num, row in enumerate(reader):
533        if num == 0:
534            writer = csv.DictWriter(open(output_file, 'wb'), reader.fieldnames)
535            print "FIELDS: "
536            for x, y in enumerate(reader.fieldnames):
537                print x, y
538            header = convert_fieldnames(reader.fieldnames)
539            writer.writerow(header)
540        #if row.get('reg_state') == 'student_created':
541        #    # We do not reimport student records which have never been accessed.
542        #    continue
543        if row.get('status') == 'started':
544            # We do not reimport started payments.
545            continue
546        for key, value in row.items():
547            # Remove unwanted whitespaces.
548            row[key] = row[key].strip()
549            if not key in OPTIONS.keys():
550                continue
551            conv_name = OPTIONS[key]
552            converter = getattr(Converters, conv_name, None)
553            if converter is None:
554                print "WARNING: cannot find converter %s" % conv_name
555                continue
556            row[key] = converter(row[key], row)
557        try:
558            writer.writerow(row)
559        except:
560            print row['student_id']
561
562    print "Output written to %s" % output_file
563
564
565if __name__ == '__main__':
566    if len(sys.argv) != 2:
567        print 'Usage: %s <filename>' % __file__
568        sys.exit(1)
569    main()
Note: See TracBrowser for help on using the repository browser.