source: main/waeup.kofa/branches/uli-async-update/tools/fix_import_file.py @ 9174

Last change on this file since 9174 was 9169, checked in by uli, 12 years ago

Merge changes from trunk, r8786-HEAD

  • Property svn:keywords set to Id
File size: 13.9 KB
Line 
1## $Id: fix_import_file.py 9169 2012-09-10 11:05:07Z uli $
2##
3## Copyright (C) 2012 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""
19Fix exports from old SRP portal and other data sources to make
20them importable by current portal.
21
22Usage:
23
24Change into this directory, set the options below (files are assumed
25to be in the same directory) and then run
26
27  python fix_import_file.py <filename>
28
29Errors/warnings will be displayed on the shell, the output will be put
30into the specified output file.
31
32
33The lgas.py module must be copied into the same folder where this script
34is started.
35"""
36import csv
37import datetime
38import re
39import sys
40
41try:
42    from lgas import LGAS
43except:
44    print 'ERROR: lgas.py is missing.'
45    sys.exit(1)
46
47def strip(string):
48    string = string.replace('_', '')
49    string = string.replace('/', '')
50    string = string.replace('-', '')
51    string = string.replace(' ', '')
52    string = string.lower()
53    return string
54
55LGAS_inverted_stripped = dict([(strip(i[1]), i[0]) for i in LGAS])
56LGAS_dict = dict(LGAS)
57
58##
59## CONFIGURATION SECTION
60##
61# keys are fieldnames in input file, values are methods of class
62# Converter (see below)
63OPTIONS = {
64    'student_id': 'student_id',
65    'sex': 'gender',
66    'birthday': 'date',
67    'marit_stat': 'marit_stat',
68    'session': 'session',
69    'entry_session': 'session',
70    'current_session': 'session',
71    'session_id': 'session',
72    'entry_mode': 'mode',
73    'reg_state': 'reg_state',
74    'password': 'password',
75    'phone': 'phone',
76    'nationality': 'nationality',
77    'level': 'level',
78    'start_level': 'level',
79    'end_level': 'level',
80    'level_id': 'level',
81    'current_level': 'level',
82    'semester': 'semester',
83    'application_category': 'application_category',
84    'lga': 'lga',
85    'order_id': 'no_int',
86    'uniben': 'former',
87    'nysc_year': 'year',
88    'alr_date': 'date',
89    'fst_sit_date': 'date',
90    'scd_sit_date': 'date',
91    'emp_start': 'date',
92    'emp_end': 'date',
93    'emp_start2': 'date',
94    'emp_end2': 'date',
95    'fst_sit_results': 'result',
96    'scd_sit_results': 'result',
97    'alr_results': 'result',
98    'email': 'email',
99    'fst_sit_type': 'sittype',
100    'scd_sit_type': 'sittype',
101    'resp_pay_reference': 'no_int',
102    'type': 'company',
103    'date': 'date',
104    'category': 'p_category',
105    'reg_transition': 'reg_state',  # we completely change this column,
106                                    # since reg_state import is usually intended
107    'transition': 'reg_transition',
108    'payment_date': 'date',
109    'validation_date': 'date',
110    }
111
112# Mapping input file colnames --> output file colnames
113COLNAME_MAPPING = {
114    # base data
115    'id': 'student_id',
116    'reg_state': 'state',
117    'reg_transition': 'state',
118    'jamb_reg_no': 'reg_number',
119    'matric_no': 'matric_number',
120    'birthday': 'date_of_birth',
121    'clr_ac_pin': 'clr_code',
122    # clearance
123    'hq_grade': 'hq_degree',
124    'uniben': 'former_matric',
125    'hq_type2': 'hq2_type',
126    'hq_grade2': 'hq2_degree',
127    'hq_school2': 'hq2_school',
128    'hq_matric_no2': 'hq2_matric_no',
129    'hq_session2': 'hq2_session',
130    'hq_disc2': 'hq2_disc',
131    'emp': 'employer',
132    'emp2': 'employer2',
133    'emp_position2': 'emp2_position',
134    'emp_start2': 'emp2_start',
135    'emp_end2': 'emp2_end',
136    'emp_reason2': 'emp2_reason',
137    # study course
138    'study_course': 'certificate',
139    # study level
140    'session': 'level_session',
141    'verdict': 'level_verdict',
142    # course ticket
143    'level_id': 'level',
144    'core_or_elective': 'mandatory',
145    # payment ticket
146    'order_id': 'p_id',
147    'status': 'p_state',
148    'category': 'p_category',
149    'resp_pay_reference': 'r_pay_reference',
150    'resp_desc': 'r_desc',
151    'resp_approved_amount': 'r_amount_approved',
152    'item': 'p_item',
153    'amount': 'amount_auth',
154    'resp_card_num': 'r_card_num',
155    'resp_code': 'r_code',
156    'date': 'creation_date',
157    'surcharge': 'surcharge_1',
158    'session_id': 'p_session',
159    'type': 'r_company',
160    }
161
162# Mapping input state --> output state
163REGSTATE_MAPPING = {
164    'student_created': 'created',
165    'admitted': 'admitted',
166    'objection_raised': 'clearance started',
167    'clearance_pin_entered': 'clearance started',
168    'clearance_requested': 'clearance requested',
169    'cleared_and_validated': 'cleared',
170    'school_fee_paid': 'school fee paid',
171    'returning': 'returning',
172    'courses_registered': 'courses registered',
173    'courses_validated': 'courses validated',
174    'admit': 'admitted',
175    'return': 'returning'
176    }
177
178##
179## END OF CONFIG
180##
181
182# Look for the first sequence of numbers
183RE_PHONE = re.compile('[^\d]*(\d*)[^\d]*')
184
185def convert_fieldnames(fieldnames):
186    """Replace input fieldnames by fieldnames of COLNAME_MAPPING.
187    """
188    # Remove whitespaces
189    header = dict([(name, name.strip()) for name in fieldnames])
190    for in_name, out_name in COLNAME_MAPPING.items():
191        if in_name not in header.values():
192            continue
193        # Inverse dictionary lookup
194        key = [key for key,value in header.items() if value==in_name][0]
195        header[key] = out_name
196    return header
197
198class Converters():
199    """Converters to turn old-style values into new ones.
200    """
201
202    @classmethod
203    def student_id(self, value, row):
204        """ 'A123456' --> 'EA123456'
205        """
206        if len(value) == 7:
207            return 'B' + value
208        return value
209
210    @classmethod
211    def reg_state(self, value, row):
212        """ 'courses_validated' --> 'courses validated'
213        """
214        return REGSTATE_MAPPING.get(value,value)
215
216    @classmethod
217    def reg_transition(self, value, row):
218        if value == "admitted":
219            return "admit"
220        if value == "returning":
221            return "return"
222        return value
223
224    @classmethod
225    def level(self, value, row):
226        """ '000' --> '10'
227        '800' --> '999' if pg student
228        """
229        try:
230            number = int(value)
231        except ValueError:
232            return 9999
233        if number == 0:
234            return 10
235        if row.get('entry_mode') and row.get('entry_mode').startswith('pg'):
236            return 999
237        return number
238
239    @classmethod
240    def semester(self, value, row):
241        """ '0' --> '9'
242        """
243        try:
244            number = int(value)
245        except ValueError:
246            return 9999
247        if number == 0:
248            return 9
249        return number
250
251    @classmethod
252    def application_category(self, value, row):
253        """ '' --> 'no'
254        """
255        if value == '':
256            return 'no'
257        return value
258
259    @classmethod
260    def lga(self, value, row):
261        """ Remove apostrophe
262        """
263        if value == 'akwa_ibom_uru_offong_oruko':
264            return 'akwa_ibom_urue-offong-oruko'
265        if value == 'edo_ohionmwon':
266            return 'edo_orhionmwon'
267
268        if value == 'nassarawa_nassarawa':
269            return 'nassarawa_nassawara'
270
271        if value == 'kogi_mopa-muro-mopi':
272            return 'kogi_mopa-muro'
273
274        if value == 'delta_osimili-north':
275            return 'delta_oshielli-north'
276
277        if value == 'delta_osimili':
278            return 'delta_oshimili'
279
280        if value == 'delta_osimili-south':
281            return 'delta_oshimili-south'
282        try:
283            value = value.replace("'","")
284        except:
285            return ''
286        lower = value.lower()
287        if lower in LGAS_dict.keys():
288            return lower
289        # If real names are given, let's see if a similar value
290        # in LGAS exist.
291        value = LGAS_inverted_stripped.get(strip(lower), value)
292        return value
293
294
295    @classmethod
296    def session(self, value, row):
297        """ '08' --> '2008'
298        '2008/2009' --> '2008'
299        """
300        if '/' in value:
301            numbers = value.split('/')
302            number = int(numbers[0])
303            if number in range(2000,2015):
304                return number
305            else:
306                return 9999
307        try:
308            number = int(value)
309        except ValueError:
310            #import pdb; pdb.set_trace()
311            return 9999
312        if number < 14:
313            return number + 2000
314        elif number in range(2000,2015):
315            return number
316        else:
317            return 9999
318
319    @classmethod
320    def former(self, value, row):
321        """ True --> yes
322        '2008/2009' --> '2008'
323        """
324        if value == 'True':
325            return 'yes'
326        return
327
328    @classmethod
329    def year(self, value, row):
330        """ '0' --> ''
331        """
332        if value == '0':
333            return
334        if value == 'None':
335            return
336        return value
337
338
339    @classmethod
340    def marit_stat(self, value, row):
341        """ 'True'/'False' --> 'married'/'unmarried'
342        """
343        if value in ('True','married'):
344            value = 'married'
345        elif value in ('False','unmarried'):
346            value = 'unmarried'
347        else:
348            value = ''
349        return value
350
351    @classmethod
352    def gender(self, value, row):
353        """ 'True'/'False' --> 'f'/'m'
354        """
355        if value.strip() in ('F', 'True','f'):
356            value = 'f'
357        elif value.strip() in ('M', 'False','m'):
358            value = 'm'
359        else:
360            value = ''
361        return value
362
363    @classmethod
364    def date(self, value, row):
365        """ 'yyyy/mm/dd' --> 'yyyy-mm-dd'
366        """
367        if value == "None":
368            value = ""
369        elif value == "":
370            value = ""
371        else:
372            value = value.replace('/', '-')
373            # We add the hash symbol to avoid automatic date transformation
374            # in Excel and Calc for further processing
375            value += '#'
376        return value
377
378    @classmethod
379    def no_int(self, value, row):
380        """ Add hash and skip numbers starting with 999999
381        """
382        # We add the hash symbol to avoid automatic number transformation
383        # in Excel and Calc for further processing
384        try:
385            intvalue = int(value)
386            value += '#'
387        except:
388            pass
389        if value.startswith('999999'):
390            return
391        return value
392
393    @classmethod
394    def mode(self, value, row):
395        if value == "transfer_fulltime":
396            return "transfer_ft"
397        if value == "ume_ft":
398            return "utme_ft"
399        return value
400
401    @classmethod
402    def password(self, value, row):
403        if value == "not set":
404            return ""
405        return value
406
407    @classmethod
408    def nationality(self, value, row):
409        if value == "nigeria":
410            return "NG"
411        if value == "niger":
412            return "NE"
413        return value
414
415    @classmethod
416    def sittype(self, value, row):
417        if value == "nabtec":
418            return "nabteb"
419        return value
420
421    @classmethod
422    def company(self, value, row):
423        if value == "online":
424            return "interswitch"
425        return value
426
427    @classmethod
428    def p_category(self, value, row):
429        if value == "acceptance":
430            return "clearance"
431        return value
432
433    @classmethod
434    def email(self, value, row):
435        return value.strip()
436
437    @classmethod
438    def phone(self, value, row):
439        """ '<num-seq1>-<num-seq2> asd' -> '--<num-seq1><num-seq2>'
440
441        Dashes and slashes are removed before looking for sequences
442        of numbers.
443        """
444        if not value:
445            return
446        value = value.strip('#')
447        value = value.replace('-', '')
448        value = value.replace('/', '')
449        match = RE_PHONE.match(value)
450        phone = match.groups()[0]
451        if value.startswith('234'):
452            value = '+' + value[:3] + '-' + value[3:]
453        else:
454            value = '-%s' % phone
455        return value + '#'
456
457    @classmethod
458    def result(self, value, row):
459        try:
460            liste = eval(value)
461        except:
462            return
463        if isinstance(liste,list):
464            return [(i[0].lower(),i[1]) for i in liste]
465        return
466
467
468def main():
469    input_file = '%s' % sys.argv[1]
470    output_file = '%s_edited.csv' % sys.argv[1].split('.')[0]
471    reader = csv.DictReader(open(input_file, 'rb'))
472    writer = None
473
474    for num, row in enumerate(reader):
475        if num == 0:
476            writer = csv.DictWriter(open(output_file, 'wb'), reader.fieldnames)
477            print "FIELDS: "
478            for x, y in enumerate(reader.fieldnames):
479                print x, y
480            header = convert_fieldnames(reader.fieldnames)
481            writer.writerow(header)
482        if row.get('reg_state') == 'student_created':
483            # We do not reimport student records which have never been accessed.
484            continue
485        if row.get('status') == 'started':
486            # We do not reimport started payments.
487            continue
488        for key, value in row.items():
489            # Remove unwanted whitespaces.
490            row[key] = row[key].strip()
491            if not key in OPTIONS.keys():
492                continue
493            conv_name = OPTIONS[key]
494            converter = getattr(Converters, conv_name, None)
495            if converter is None:
496                print "WARNING: cannot find converter %s" % conv_name
497                continue
498            row[key] = converter(row[key], row)
499        try:
500            writer.writerow(row)
501        except:
502            print row['student_id']
503
504    print "Output written to %s" % output_file
505
506
507if __name__ == '__main__':
508    if len(sys.argv) != 2:
509        print 'Usage: %s <filename>' % __file__
510        sys.exit(1)
511    main()
Note: See TracBrowser for help on using the repository browser.