[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Gnumed-devel] Numberless addresses
From: |
Jim Busser |
Subject: |
Re: [Gnumed-devel] Numberless addresses |
Date: |
Tue, 11 Oct 2011 01:14:37 -0700 |
On 2011-10-10, at 11:00 PM, Liz wrote:
> Please, never assume that a number is part of an address
>
> eg
> "Property Name"
> "Town"
>
> is quite normal even further inland from here
The appended is what I have so far, in terms of parsing of US / Commonwealth
style addresses.
It should be usable in any psql provided the alias s_c for
staging.cpsbc
and the field names, if they are different from those below, prompt adjustment.
-- Jim
--
=============================================================================================
-- dem.street
--
-- Proof (visually check) the expected results of the following parser /
splitter.
-- The parser splitter processes, on staging fields as arbitrarily named below,
as follows:
-- when UPPER(string) includes 'BOX ' or 'BAG ' (note the space, in view of ~
'Boxwood St')
-- and likewise when the string begins 'RR ' for Rural Route
-- --> we assume these act as postal instruction rather than as any physical
street
-- (we cannot help if some actual street is contained in a second address
line, a1_addr2)
-- (content in any address_line_2 can anyway go into the column
'.addendum')
-- when a likely location for the street can otherwise be reasonably
interpreted
-- then the following US / Commonwealth style conventions are handled and
assumed
-- --> with one exception ['-'] we rely below on a space to separate 'Number'
and 'Street'
-- comma separated:
-- two commas: <various idiosyncrasies>,, <Number> <Street>
-- one comma: <Number> <Street>, <Apartment>
-- hyphen-separated:
-- two hyphens: <Subunit>-<Number>-<Street>
-- one hyphen: <Subunit>-<Number> <Street>
--
-- this parser / splitter is necessarily replicated for each set a1_addr1 ...
a3_addr1
-- Suggest to preview each SELECT before proceeding to do live inserts
-- Suggest that these are easier to proof one group at a time
-- --> achieve this by appending a copy of the WHEN condition to WHERE ... AND
SELECT DISTINCT street, number, subunit FROM
(SELECT
s_c.a1_addr1,
CASE
-- when it may be a postal box
WHEN position('BOX ' in upper(s_c.a1_addr1)) > 0 or position('BAG ' in
upper(s_c.a1_addr1)) > 0
THEN s_c.a1_addr1
-- when address begins RR, this will be rural route
WHEN position('RR ' in s_c.a1_addr1) = 1 OR position('RR#' in s_c.a1_addr1)
= 1
THEN s_c.a1_addr1
-- when address contains two commas, expect the number and street to be in
the *last* part:
WHEN char_length(split_part(s_c.a1_addr1, ',', 3))>0
THEN SUBSTR(split_part(s_c.a1_addr1, ',', 3),position(' ' in
split_part(s_c.a1_addr1, ',', 3))+1)
-- when address contains one comma, expect the number and street to be in
the *first* part:
WHEN char_length(split_part(s_c.a1_addr1, ',', 2))>0
THEN SUBSTR(split_part(s_c.a1_addr1, ',', 1),position(' ' in
split_part(s_c.a1_addr1, ',', 1))+1)
-- when address contains 2 hyphens, do:
WHEN char_length(split_part(s_c.a1_addr1, '-', 3))>0 AND
char_length(split_part(s_c.a1_addr1, '-', 4))=0
THEN SUBSTR(split_part(s_c.a1_addr1, '-', 3),position(' ' in
split_part(s_c.a1_addr1, '-', 3))+1)
-- when address contains 1 hyphen, do:
WHEN char_length(split_part(s_c.a1_addr1, '-', 2))>0 AND
char_length(split_part(s_c.a1_addr1, '-', 4))=0
THEN SUBSTR(split_part(s_c.a1_addr1, '-', 2),position(' ' in
split_part(s_c.a1_addr1, '-', 2))+1)
-- when address contains no hyphen, yet a space is present, do:
WHEN char_length(split_part(s_c.a1_addr1, ' ', 2))>0
THEN SUBSTR(s_c.a1_addr1, position(' ' in s_c.a1_addr1)+1)
END as street,
CASE
-- when it may be a postal box
WHEN position('BOX ' in upper(s_c.a1_addr1)) > 0 or position('BAG ' in
upper(s_c.a1_addr1)) > 0
THEN ''
-- when address begins RR, this will be rural route
WHEN position('RR ' in s_c.a1_addr1) = 1 OR position('RR#' in s_c.a1_addr1)
= 1
THEN ''
-- when address contains two commas, expect the number and street to be in
the *last* part:
WHEN char_length(split_part(s_c.a1_addr1, ',', 3))>0
THEN TRIM(SUBSTR(split_part(s_c.a1_addr1, ',', 3),1,position(' ' in
split_part(s_c.a1_addr1, ',', 3))))
-- when address contains one comma, expect the number and street to be in
the *first* part:
WHEN char_length(split_part(s_c.a1_addr1, ',', 2))>0
THEN TRIM(SUBSTR(split_part(s_c.a1_addr1, ',', 1),1,position(' ' in
split_part(s_c.a1_addr1, ',', 1))))
-- when address contains 2 hyphens, do:
WHEN char_length(split_part(s_c.a1_addr1, '-', 3))>0 AND
char_length(split_part(s_c.a1_addr1, '-', 4))=0
THEN TRIM(SUBSTR(split_part(s_c.a1_addr1, '-', 3),1,position(' ' in
split_part(s_c.a1_addr1, '-', 3))))
-- when address contains 1 hyphen, do:
WHEN char_length(split_part(s_c.a1_addr1, '-', 2))>0 AND
char_length(split_part(s_c.a1_addr1, '-', 4))=0
THEN TRIM(SUBSTR(split_part(s_c.a1_addr1, '-', 2),1,position(' ' in
split_part(s_c.a1_addr1, '-', 2))))
-- when address contains no hyphen, yet a space is present, do:
WHEN char_length(split_part(s_c.a1_addr1, ' ', 2))>0
THEN TRIM(SUBSTR(s_c.a1_addr1, 1,position(' ' in s_c.a1_addr1)))
END as number,
CASE
-- when it may be a postal box
WHEN position('BOX ' in upper(s_c.a1_addr1)) > 0 or position('BAG ' in
upper(s_c.a1_addr1)) > 0
THEN NULL
-- when address begins RR, this will be rural route
WHEN position('RR ' in s_c.a1_addr1) = 1 OR position('RR#' in s_c.a1_addr1)
= 1
THEN NULL
-- when address contains two commas, expect the number and street to be in
the *last* part:
WHEN char_length(split_part(s_c.a1_addr1, ',', 3))>0
THEN split_part(s_c.a1_addr1, ',', 1) || ',' || split_part(s_c.a1_addr1,
',', 2)
-- when address contains one comma, expect the number and street to be in
the *first* part:
WHEN char_length(split_part(s_c.a1_addr1, ',', 2))>0
THEN split_part(s_c.a1_addr1, ',', 1)
-- when address contains 2 hyphens, expect the number and street to be in
the *last* part:
WHEN char_length(split_part(s_c.a1_addr1, '-', 3))>0 AND
char_length(split_part(s_c.a1_addr1, '-', 4))=0
THEN split_part(s_c.a1_addr1, '-', 1) || ',' || split_part(s_c.a1_addr1,
'-', 2)
-- when address contains 1 hyphen, do:
WHEN char_length(split_part(s_c.a1_addr1, '-', 2))>0 AND
char_length(split_part(s_c.a1_addr1, '-', 4))=0
THEN split_part(s_c.a1_addr1, '-', 1)
-- when address contains no hyphen, yet a space is present, do:
WHEN char_length(split_part(s_c.a1_addr1, ' ', 2))>0
THEN NULL
END as subunit,
s_c.a1_addr2 as addendum,
s_c.a1_city as city,
s_c.a1_prov as state,
s_c.a1_postalcode as postcode,
s_c.a1_country as country
FROM staging.cpsbc s_c
-- WHERE ... condition by which to better examine candidate streets e.g.
WHERE char_length(split_part(s_c.a1_addr1, ' ', 2))>0
ORDER BY street, number
) AS virtual -- <--- else get 'subquery in FROM must have an alias'
INNER JOIN dem.urb d_u
ON UPPER(virtual.city)=UPPER(d_u.name)
INNER JOIN dem.state d_state
ON UPPER(virtual.state)=UPPER(d_state.code)
INNER JOIN dem.country d_c
ON UPPER(virtual.country)=UPPER(d_c.name)
WHERE UPPER(virtual.street) not in (SELECT UPPER(name) FROM dem.street)
ORDER BY street, number, subunit
;