source: main/waeup.kofa/trunk/tools/fix_import_file.py @ 9041

Last change on this file since 9041 was 9019, checked in by Henrik Bettermann, 12 years ago

Avoid that script has to be run twice.

  • Property svn:keywords set to Id
File size: 13.8 KB
Line 
1## $Id: fix_import_file.py 9019 2012-07-18 14:16:21Z henrik $
2##
3## Copyright (C) 2012 Uli Fouquet & Henrik Bettermann
4## This program is free software; you can redistribute it and/or modify
5## it under the terms of the GNU General Public License as published by
6## the Free Software Foundation; either version 2 of the License, or
7## (at your option) any later version.
8##
9## This program is distributed in the hope that it will be useful,
10## but WITHOUT ANY WARRANTY; without even the implied warranty of
11## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12## GNU General Public License for more details.
13##
14## You should have received a copy of the GNU General Public License
15## along with this program; if not, write to the Free Software
16## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17##
18"""
19Fix exports from old SRP portal and other data sources to make
20them importable by current portal.
21
22Usage:
23
24Change into this directory, set the options below (files are assumed
25to be in the same directory) and then run
26
27  python fix_import_file.py <filename>
28
29Errors/warnings will be displayed on the shell, the output will be put
30into the specified output file.
31
32
33The lgas.py module must be copied into the same folder where this script
34is started.
35"""
36import csv
37import datetime
38import re
39import sys
40
41try:
42    from lgas import LGAS
43except:
44    print 'ERROR: lgas.py is missing.'
45    sys.exit(1)
46
47def strip(string):
48    string = string.replace('_', '')
49    string = string.replace('/', '')
50    string = string.replace('-', '')
51    string = string.replace(' ', '')
52    string = string.lower()
53    return string
54
55LGAS_inverted_stripped = dict([(strip(i[1]), i[0]) for i in LGAS])
56LGAS_dict = dict(LGAS)
57
58##
59## CONFIGURATION SECTION
60##
61# keys are fieldnames in input file, values are methods of class
62# Converter (see below)
63OPTIONS = {
64    'student_id': 'student_id',
65    'sex': 'gender',
66    'birthday': 'date',
67    'marit_stat': 'marit_stat',
68    'session': 'session',
69    'entry_session': 'session',
70    'current_session': 'session',
71    'session_id': 'session',
72    'entry_mode': 'mode',
73    'reg_state': 'reg_state',
74    'password': 'password',
75    'phone': 'phone',
76    'nationality': 'nationality',
77    'level': 'level',
78    'start_level': 'level',
79    'end_level': 'level',
80    'level_id': 'level',
81    'current_level': 'level',
82    'semester': 'semester',
83    'application_category': 'application_category',
84    'lga': 'lga',
85    'order_id': 'no_int',
86    'uniben': 'former',
87    'nysc_year': 'year',
88    'alr_date': 'date',
89    'fst_sit_date': 'date',
90    'scd_sit_date': 'date',
91    'emp_start': 'date',
92    'emp_end': 'date',
93    'emp_start2': 'date',
94    'emp_end2': 'date',
95    'fst_sit_results': 'result',
96    'scd_sit_results': 'result',
97    'alr_results': 'result',
98    'email': 'email',
99    'fst_sit_type': 'sittype',
100    'scd_sit_type': 'sittype',
101    'resp_pay_reference': 'no_int',
102    'type': 'company',
103    'date': 'date',
104    'category': 'p_category',
105    'reg_transition': 'reg_state',  # we completely change this column,
106                                    # since reg_state import is usually intended
107    'transition': 'reg_transition',
108    }
109
110# Mapping input file colnames --> output file colnames
111COLNAME_MAPPING = {
112    # base data
113    'id': 'student_id',
114    'reg_state': 'state',
115    'reg_transition': 'state',
116    'jamb_reg_no': 'reg_number',
117    'matric_no': 'matric_number',
118    'birthday': 'date_of_birth',
119    'clr_ac_pin': 'clr_code',
120    # clearance
121    'hq_grade': 'hq_degree',
122    'uniben': 'former_matric',
123    'hq_type2': 'hq2_type',
124    'hq_grade2': 'hq2_degree',
125    'hq_school2': 'hq2_school',
126    'hq_matric_no2': 'hq2_matric_no',
127    'hq_session2': 'hq2_session',
128    'hq_disc2': 'hq2_disc',
129    'emp': 'employer',
130    'emp2': 'employer2',
131    'emp_position2': 'emp2_position',
132    'emp_start2': 'emp2_start',
133    'emp_end2': 'emp2_end',
134    'emp_reason2': 'emp2_reason',
135    # study course
136    'study_course': 'certificate',
137    # study level
138    'session': 'level_session',
139    'verdict': 'level_verdict',
140    # course ticket
141    'level_id': 'level',
142    'core_or_elective': 'mandatory',
143    # payment ticket
144    'order_id': 'p_id',
145    'status': 'p_state',
146    'category': 'p_category',
147    'resp_pay_reference': 'r_pay_reference',
148    'resp_desc': 'r_desc',
149    'resp_approved_amount': 'r_amount_approved',
150    'item': 'p_item',
151    'amount': 'amount_auth',
152    'resp_card_num': 'r_card_num',
153    'resp_code': 'r_code',
154    'date': 'creation_date',
155    'surcharge': 'surcharge_1',
156    'session_id': 'p_session',
157    'type': 'r_company',
158    }
159
160# Mapping input state --> output state
161REGSTATE_MAPPING = {
162    'student_created': 'created',
163    'admitted': 'admitted',
164    'objection_raised': 'clearance started',
165    'clearance_pin_entered': 'clearance started',
166    'clearance_requested': 'clearance requested',
167    'cleared_and_validated': 'cleared',
168    'school_fee_paid': 'school fee paid',
169    'returning': 'returning',
170    'courses_registered': 'courses registered',
171    'courses_validated': 'courses validated',
172    'admit': 'admitted',
173    'return': 'returning'
174    }
175
176##
177## END OF CONFIG
178##
179
180# Look for the first sequence of numbers
181RE_PHONE = re.compile('[^\d]*(\d*)[^\d]*')
182
183def convert_fieldnames(fieldnames):
184    """Replace input fieldnames by fieldnames of COLNAME_MAPPING.
185    """
186    # Remove whitespaces
187    header = dict([(name, name.strip()) for name in fieldnames])
188    for in_name, out_name in COLNAME_MAPPING.items():
189        if in_name not in header.values():
190            continue
191        # Inverse dictionary lookup
192        key = [key for key,value in header.items() if value==in_name][0]
193        header[key] = out_name
194    return header
195
196class Converters():
197    """Converters to turn old-style values into new ones.
198    """
199
200    @classmethod
201    def student_id(self, value, row):
202        """ 'A123456' --> 'EA123456'
203        """
204        if len(value) == 7:
205            return 'B' + value
206        return value
207
208    @classmethod
209    def reg_state(self, value, row):
210        """ 'courses_validated' --> 'courses validated'
211        """
212        return REGSTATE_MAPPING.get(value,value)
213
214    @classmethod
215    def reg_transition(self, value, row):
216        if value == "admitted":
217            return "admit"
218        if value == "returning":
219            return "return"
220        return value
221
222    @classmethod
223    def level(self, value, row):
224        """ '000' --> '10'
225        '800' --> '999' if pg student
226        """
227        try:
228            number = int(value)
229        except ValueError:
230            return 9999
231        if number == 0:
232            return 10
233        if row.get('entry_mode').startswith('pg'):
234            return 999
235        return number
236
237    @classmethod
238    def semester(self, value, row):
239        """ '0' --> '9'
240        """
241        try:
242            number = int(value)
243        except ValueError:
244            return 9999
245        if number == 0:
246            return 9
247        return number
248
249    @classmethod
250    def application_category(self, value, row):
251        """ '' --> 'no'
252        """
253        if value == '':
254            return 'no'
255        return value
256
257    @classmethod
258    def lga(self, value, row):
259        """ Remove apostrophe
260        """
261        if value == 'akwa_ibom_uru_offong_oruko':
262            return 'akwa_ibom_urue-offong-oruko'
263        if value == 'edo_ohionmwon':
264            return 'edo_orhionmwon'
265
266        if value == 'nassarawa_nassarawa':
267            return 'nassarawa_nassawara'
268
269        if value == 'kogi_mopa-muro-mopi':
270            return 'kogi_mopa-muro'
271
272        if value == 'delta_osimili-north':
273            return 'delta_oshielli-north'
274
275        if value == 'delta_osimili':
276            return 'delta_oshimili'
277
278        if value == 'delta_osimili-south':
279            return 'delta_oshimili-south'
280        try:
281            value = value.replace("'","")
282        except:
283            return ''
284        lower = value.lower()
285        if lower in LGAS_dict.keys():
286            return lower
287        # If real names are given, let's see if a similar value
288        # in LGAS exist.
289        value = LGAS_inverted_stripped.get(strip(lower), value)
290        return value
291
292
293    @classmethod
294    def session(self, value, row):
295        """ '08' --> '2008'
296        '2008/2009' --> '2008'
297        """
298        if '/' in value:
299            numbers = value.split('/')
300            number = int(numbers[0])
301            if number in range(2000,2015):
302                return number
303            else:
304                return 9999
305        try:
306            number = int(value)
307        except ValueError:
308            #import pdb; pdb.set_trace()
309            return 9999
310        if number < 14:
311            return number + 2000
312        elif number in range(2000,2015):
313            return number
314        else:
315            return 9999
316
317    @classmethod
318    def former(self, value, row):
319        """ True --> yes
320        '2008/2009' --> '2008'
321        """
322        if value == 'True':
323            return 'yes'
324        return
325
326    @classmethod
327    def year(self, value, row):
328        """ '0' --> ''
329        """
330        if value == '0':
331            return
332        if value == 'None':
333            return
334        return value
335
336
337    @classmethod
338    def marit_stat(self, value, row):
339        """ 'True'/'False' --> 'married'/'unmarried'
340        """
341        if value in ('True','married'):
342            value = 'married'
343        elif value in ('False','unmarried'):
344            value = 'unmarried'
345        else:
346            value = ''
347        return value
348
349    @classmethod
350    def gender(self, value, row):
351        """ 'True'/'False' --> 'f'/'m'
352        """
353        if value.strip() in ('F', 'True','f'):
354            value = 'f'
355        elif value.strip() in ('M', 'False','m'):
356            value = 'm'
357        else:
358            value = ''
359        return value
360
361    @classmethod
362    def date(self, value, row):
363        """ 'yyyy/mm/dd' --> 'yyyy-mm-dd'
364        """
365        if value == "None":
366            value = ""
367        elif value == "":
368            value = ""
369        else:
370            value = value.replace('/', '-')
371            # We add the hash symbol to avoid automatic date transformation
372            # in Excel and Calc for further processing
373            value += '#'
374        return value
375
376    @classmethod
377    def no_int(self, value, row):
378        """ Add hash and skip numbers starting with 999999
379        """
380        # We add the hash symbol to avoid automatic number transformation
381        # in Excel and Calc for further processing
382        try:
383            intvalue = int(value)
384            value += '#'
385        except:
386            pass
387        if value.startswith('999999'):
388            return
389        return value
390
391    @classmethod
392    def mode(self, value, row):
393        if value == "transfer_fulltime":
394            return "transfer_ft"
395        if value == "ume_ft":
396            return "utme_ft"
397        return value
398
399    @classmethod
400    def password(self, value, row):
401        if value == "not set":
402            return ""
403        return value
404
405    @classmethod
406    def nationality(self, value, row):
407        if value == "nigeria":
408            return "NG"
409        if value == "niger":
410            return "NE"
411        return value
412
413    @classmethod
414    def sittype(self, value, row):
415        if value == "nabtec":
416            return "nabteb"
417        return value
418
419    @classmethod
420    def company(self, value, row):
421        if value == "online":
422            return "interswitch"
423        return value
424
425    @classmethod
426    def p_category(self, value, row):
427        if value == "acceptance":
428            return "clearance"
429        return value
430
431    @classmethod
432    def email(self, value, row):
433        return value.strip()
434
435    @classmethod
436    def phone(self, value, row):
437        """ '<num-seq1>-<num-seq2> asd' -> '--<num-seq1><num-seq2>'
438
439        Dashes and slashes are removed before looking for sequences
440        of numbers.
441        """
442        if not value:
443            return
444        value = value.strip('#')
445        value = value.replace('-', '')
446        value = value.replace('/', '')
447        match = RE_PHONE.match(value)
448        phone = match.groups()[0]
449        if value.startswith('234'):
450            value = '+' + value[:3] + '-' + value[3:]
451        else:
452            value = '-%s' % phone
453        return value + '#'
454
455    @classmethod
456    def result(self, value, row):
457        try:
458            liste = eval(value)
459        except:
460            return
461        if isinstance(liste,list):
462            return [(i[0].lower(),i[1]) for i in liste]
463        return
464
465
466def main():
467    input_file = '%s' % sys.argv[1]
468    output_file = '%s_edited.csv' % sys.argv[1].split('.')[0]
469    reader = csv.DictReader(open(input_file, 'rb'))
470    writer = None
471
472    for num, row in enumerate(reader):
473        if num == 0:
474            writer = csv.DictWriter(open(output_file, 'wb'), reader.fieldnames)
475            print "FIELDS: "
476            for x, y in enumerate(reader.fieldnames):
477                print x, y
478            header = convert_fieldnames(reader.fieldnames)
479            writer.writerow(header)
480        if row.get('reg_state') == 'student_created':
481            # We do not reimport student records which have never been accessed.
482            continue
483        if row.get('status') == 'started':
484            # We do not reimport started payments.
485            continue
486        for key, value in row.items():
487            # Remove unwanted whitespaces.
488            row[key] = row[key].strip()
489            if not key in OPTIONS.keys():
490                continue
491            conv_name = OPTIONS[key]
492            converter = getattr(Converters, conv_name, None)
493            if converter is None:
494                print "WARNING: cannot find converter %s" % conv_name
495                continue
496            row[key] = converter(row[key], row)
497        try:
498            writer.writerow(row)
499        except:
500            print row['student_id']
501
502    print "Output written to %s" % output_file
503
504
505if __name__ == '__main__':
506    if len(sys.argv) != 2:
507        print 'Usage: %s <filename>' % __file__
508        sys.exit(1)
509    main()
Note: See TracBrowser for help on using the repository browser.