gnumed-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Gnumed-devel] dbgen import csv scripts for gnumed.


From: sjtan
Subject: [Gnumed-devel] dbgen import csv scripts for gnumed.
Date: Mon, 23 Aug 2004 11:35:23 +1000
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7) Gecko/20040616

correct_postcodes.py does some validation of urb, state, postcode , and birthdate ,
then  csv_to_gnumed.py imports a dbgen csv file into a test gnumed database.

The intention is to run do a project-dedup febrl run on a dataset residing on a gnumed schema.


"""reads a febrl dbgen csv file , and outputs a gnumed compatible postcode/urb 
combination correct file to standard output , can be redirected to another csv 
file"""

# run in the dbgen directory
csv='../dbgen/dataset1.csv'

import pgdb
import datetime

def validate_date( dobnum):
        if len(dobnum) < 8:
                return '20040101'
        y, m, d =  int(dobnum[0:4]), int(dobnum[4:6]), int(dobnum[6:]) 
        
        """     thirty days has september, 
                april , june and november; 
                all the rest have 31, 
                except february which has 28"""
                
        if m < 1 : m = 1 
        elif m > 12 : m = 12

        if m in [9, 4, 6, 11]: days = range(1,31) 
        elif m == 2: days = range(1,29)
        else: days = range(1, 32)

        if d < days[0] : d = days[0]
        elif d > days[-1]: d= days[-1]
        
        return datetime.datetime(y,m,d).strftime('%Y%m%d')      


c=pgdb.connect('localhost:gnumed')
cu = c.cursor()
pcodes = {}
states = {}
citys = {}
title = 1
for l in file(csv):
        
        #print l
        if title:
                title = 0
                print l.strip()
                continue
        [rec_id, firstnames, lastnames, number, street1, street2, city, 
postcode, state, dobnum, tel] = [x.strip() for x in l.split(',')]
        #print type(postcode), postcode
        if len(postcode) == 0:
                continue
        pcode = int(postcode)
        if not pcodes.has_key(pcode):
                cu.execute("select urb.name, state.code from urb, state where 
postcode= %d and state.id = urb.id_state" % pcode)
                l = cu.fetchall()
                pcodes[pcode] = [ x[0].strip() for x in l]
                states[pcode] = [ x[1].strip() for x in l]
        city = city.upper()
        if not city in pcodes[pcode] and len(pcodes[pcode]):
                #print "city", city, " doesn't match ", pcode, pcodes[pcode]
                        city = pcodes[pcode][0]
        elif not city in pcodes[pcode] and len(pcodes[pcode]) == 0:
                cu.execute("select urb.postcode, state.code from urb, state 
where urb.name = '%s' and state.id = urb.id_state" % city)
                l = cu.fetchone()
                if not l:
                        continue
                if len(l) == 2:
                        pcode, state = int(l[0]), l[1]
                        states[pcode] = [state]

                
        state = state.upper()
        if not state in states[pcode] and len(states[pcode]):
                #print "state ", state, " isn't in ", states[pcode]
                state = states[pcode][0]

                
        dobnum = validate_date(dobnum)

        print ', '.join( [ rec_id, firstnames, lastnames, number, street1, 
street2, city, postcode, state, dobnum, tel] )


        

        

        
        
"""run correct_postcodes.py on say datase1.csv and redirect output to the file 
named in the csv variable below"""

#the file name that captured the standard output of correct_csv.py
csv='dataset1-gnumed.csv'

soc_sec_org = 'Centrelink'
import pgdb

def escape_quote(txt, j=0 , k=0):
        if k == 0:
                k = len(txt)

        for i in range(j, k):
                if txt[i] == "'":
                        txt = txt[:i] + '\\'+txt[i:]
                        return escape_quote(txt, i + 2, len(txt))
        return txt      

c = pgdb.connect('localhost:gnumed')
cu = c.cursor()
title = 1
for l in file(csv):
        if title:
                title = 0
                continue
        print l
        [rec_id, firstnames, lastnames, number, street1, street2, city, 
postcode, state, dobnum, soc_sec_id] = [x.strip() for x in l.split(',')]
        if dobnum <> '':
                dob = int(dobnum)
        else:
                dob = 20040101
        c.commit()
        cu.execute("select nextval('identity_id_seq')")
        [id] = cu.fetchone()
        
        cu.execute("insert into identity ( id, dob) values( %d, '%d')" % (id, 
dob) )
        cu.execute("insert into names( lastnames, firstnames, id_identity) 
values( '%s', '%s', %d)"  % ( escape_quote(lastnames), 
escape_quote(firstnames), id) )
        cu.execute("insert into v_basic_address( number, street, street2, city, 
postcode, state, country) values ( '%s', '%s', '%s', '%s', '%s', '%s', 'AU' ) " 
% ( number, escape_quote(street1), escape_quote(street2), 
escape_quote(city.upper()), postcode, state.upper()) )
        cu.execute("select currval('address_id_seq')")
        [id_address] = cu.fetchone()
        cu.execute("insert into lnk_person_org_address( id_identity, 
id_address, id_type) values( %d, %d, 1) " % ( id, id_address) )

        cu.execute("insert into lnk_identity2ext_id ( external_id, fk_origin, 
id_identity) values ( '%s', (select pk from enum_ext_id_types where issuer= 
'%s'), %d)" % ( soc_sec_id, soc_sec_org, id) )

        # CHANGE THIS TO COMMENT TO PUT TEST DATA IN GNUMED DB. ONLY ON A TEST 
DATABASE, AS NOT REVERSIBLE.
        #c.commit()
        c.rollback()


        
        
        



reply via email to

[Prev in Thread] Current Thread [Next in Thread]