[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Gnumed-devel] dbgen import csv scripts for gnumed.
From: |
sjtan |
Subject: |
[Gnumed-devel] dbgen import csv scripts for gnumed. |
Date: |
Mon, 23 Aug 2004 11:35:23 +1000 |
User-agent: |
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7) Gecko/20040616 |
correct_postcodes.py does some validation of urb, state, postcode , and
birthdate ,
then csv_to_gnumed.py imports a dbgen csv file into a test gnumed database.
The intention is to run do a project-dedup febrl run on a dataset
residing on a gnumed schema.
"""reads a febrl dbgen csv file , and outputs a gnumed compatible postcode/urb
combination correct file to standard output , can be redirected to another csv
file"""
# run in the dbgen directory
csv='../dbgen/dataset1.csv'
import pgdb
import datetime
def validate_date( dobnum):
if len(dobnum) < 8:
return '20040101'
y, m, d = int(dobnum[0:4]), int(dobnum[4:6]), int(dobnum[6:])
""" thirty days has september,
april , june and november;
all the rest have 31,
except february which has 28"""
if m < 1 : m = 1
elif m > 12 : m = 12
if m in [9, 4, 6, 11]: days = range(1,31)
elif m == 2: days = range(1,29)
else: days = range(1, 32)
if d < days[0] : d = days[0]
elif d > days[-1]: d= days[-1]
return datetime.datetime(y,m,d).strftime('%Y%m%d')
c=pgdb.connect('localhost:gnumed')
cu = c.cursor()
pcodes = {}
states = {}
citys = {}
title = 1
for l in file(csv):
#print l
if title:
title = 0
print l.strip()
continue
[rec_id, firstnames, lastnames, number, street1, street2, city,
postcode, state, dobnum, tel] = [x.strip() for x in l.split(',')]
#print type(postcode), postcode
if len(postcode) == 0:
continue
pcode = int(postcode)
if not pcodes.has_key(pcode):
cu.execute("select urb.name, state.code from urb, state where
postcode= %d and state.id = urb.id_state" % pcode)
l = cu.fetchall()
pcodes[pcode] = [ x[0].strip() for x in l]
states[pcode] = [ x[1].strip() for x in l]
city = city.upper()
if not city in pcodes[pcode] and len(pcodes[pcode]):
#print "city", city, " doesn't match ", pcode, pcodes[pcode]
city = pcodes[pcode][0]
elif not city in pcodes[pcode] and len(pcodes[pcode]) == 0:
cu.execute("select urb.postcode, state.code from urb, state
where urb.name = '%s' and state.id = urb.id_state" % city)
l = cu.fetchone()
if not l:
continue
if len(l) == 2:
pcode, state = int(l[0]), l[1]
states[pcode] = [state]
state = state.upper()
if not state in states[pcode] and len(states[pcode]):
#print "state ", state, " isn't in ", states[pcode]
state = states[pcode][0]
dobnum = validate_date(dobnum)
print ', '.join( [ rec_id, firstnames, lastnames, number, street1,
street2, city, postcode, state, dobnum, tel] )
"""run correct_postcodes.py on say datase1.csv and redirect output to the file
named in the csv variable below"""
#the file name that captured the standard output of correct_csv.py
csv='dataset1-gnumed.csv'
soc_sec_org = 'Centrelink'
import pgdb
def escape_quote(txt, j=0 , k=0):
if k == 0:
k = len(txt)
for i in range(j, k):
if txt[i] == "'":
txt = txt[:i] + '\\'+txt[i:]
return escape_quote(txt, i + 2, len(txt))
return txt
c = pgdb.connect('localhost:gnumed')
cu = c.cursor()
title = 1
for l in file(csv):
if title:
title = 0
continue
print l
[rec_id, firstnames, lastnames, number, street1, street2, city,
postcode, state, dobnum, soc_sec_id] = [x.strip() for x in l.split(',')]
if dobnum <> '':
dob = int(dobnum)
else:
dob = 20040101
c.commit()
cu.execute("select nextval('identity_id_seq')")
[id] = cu.fetchone()
cu.execute("insert into identity ( id, dob) values( %d, '%d')" % (id,
dob) )
cu.execute("insert into names( lastnames, firstnames, id_identity)
values( '%s', '%s', %d)" % ( escape_quote(lastnames),
escape_quote(firstnames), id) )
cu.execute("insert into v_basic_address( number, street, street2, city,
postcode, state, country) values ( '%s', '%s', '%s', '%s', '%s', '%s', 'AU' ) "
% ( number, escape_quote(street1), escape_quote(street2),
escape_quote(city.upper()), postcode, state.upper()) )
cu.execute("select currval('address_id_seq')")
[id_address] = cu.fetchone()
cu.execute("insert into lnk_person_org_address( id_identity,
id_address, id_type) values( %d, %d, 1) " % ( id, id_address) )
cu.execute("insert into lnk_identity2ext_id ( external_id, fk_origin,
id_identity) values ( '%s', (select pk from enum_ext_id_types where issuer=
'%s'), %d)" % ( soc_sec_id, soc_sec_org, id) )
# CHANGE THIS TO COMMENT TO PUT TEST DATA IN GNUMED DB. ONLY ON A TEST
DATABASE, AS NOT REVERSIBLE.
#c.commit()
c.rollback()
- [Gnumed-devel] dbgen import csv scripts for gnumed.,
sjtan <=