gnumed-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Gnumed-devel] Numberless addresses


From: Jim Busser
Subject: Re: [Gnumed-devel] Numberless addresses
Date: Tue, 11 Oct 2011 01:14:37 -0700

On 2011-10-10, at 11:00 PM, Liz wrote:

> Please, never assume that a number is part of an address
> 
> eg 
> "Property Name"
> "Town"
> 
> is quite normal even further inland from here

The appended is what I have so far, in terms of parsing of US / Commonwealth 
style addresses.

It should be usable in any psql provided the alias s_c for

        staging.cpsbc

and the field names, if they are different from those below, prompt adjustment.

-- Jim


-- 
=============================================================================================
-- dem.street
-- 
-- Proof (visually check) the expected results of the following parser / 
splitter.
-- The parser splitter processes, on staging fields as arbitrarily named below, 
as follows:
--   when UPPER(string) includes 'BOX ' or 'BAG ' (note the space, in view of ~ 
'Boxwood St')
--   and likewise when the string begins 'RR ' for Rural Route
--   --> we assume these act as postal instruction rather than as any physical 
street
--       (we cannot help if some actual street is contained in a second address 
line, a1_addr2)
--       (content in any address_line_2 can anyway go into the column 
'.addendum')
--   when a likely location for the street can otherwise be reasonably 
interpreted
--   then the following US / Commonwealth style conventions are handled and 
assumed
--   --> with one exception ['-'] we rely below on a space to separate 'Number' 
and 'Street'
--       comma separated:
--         two commas: <various idiosyncrasies>,, <Number> <Street>
--         one comma: <Number> <Street>, <Apartment>
--       hyphen-separated:
--         two hyphens: <Subunit>-<Number>-<Street>
--         one hyphen: <Subunit>-<Number> <Street>
--         
-- this parser / splitter is necessarily replicated for each set a1_addr1 ... 
a3_addr1
-- Suggest to preview each SELECT before proceeding to do live inserts
-- Suggest that these are easier to proof one group at a time
-- --> achieve this by appending a copy of the WHEN condition to WHERE ... AND

SELECT DISTINCT street, number, subunit FROM
(SELECT
  s_c.a1_addr1,
  CASE
    -- when it may be a postal box
    WHEN position('BOX ' in upper(s_c.a1_addr1)) > 0 or position('BAG ' in 
upper(s_c.a1_addr1)) > 0
      THEN s_c.a1_addr1
    -- when address begins RR, this will be rural route
    WHEN position('RR ' in s_c.a1_addr1) = 1 OR position('RR#' in s_c.a1_addr1) 
= 1
      THEN s_c.a1_addr1   
    -- when address contains two commas, expect the number and street to be in 
the *last* part:
    WHEN char_length(split_part(s_c.a1_addr1, ',', 3))>0
      THEN SUBSTR(split_part(s_c.a1_addr1, ',', 3),position(' ' in 
split_part(s_c.a1_addr1, ',', 3))+1)    
    -- when address contains one comma, expect the number and street to be in 
the *first* part:
    WHEN char_length(split_part(s_c.a1_addr1, ',', 2))>0
      THEN SUBSTR(split_part(s_c.a1_addr1, ',', 1),position(' ' in 
split_part(s_c.a1_addr1, ',', 1))+1)    
    -- when address contains 2 hyphens, do:
    WHEN char_length(split_part(s_c.a1_addr1, '-', 3))>0 AND 
char_length(split_part(s_c.a1_addr1, '-', 4))=0
      THEN SUBSTR(split_part(s_c.a1_addr1, '-', 3),position(' ' in 
split_part(s_c.a1_addr1, '-', 3))+1)
    -- when address contains 1 hyphen, do:
    WHEN char_length(split_part(s_c.a1_addr1, '-', 2))>0 AND 
char_length(split_part(s_c.a1_addr1, '-', 4))=0
      THEN SUBSTR(split_part(s_c.a1_addr1, '-', 2),position(' ' in 
split_part(s_c.a1_addr1, '-', 2))+1)
    -- when address contains no hyphen, yet a space is present, do:
    WHEN char_length(split_part(s_c.a1_addr1, ' ', 2))>0
      THEN SUBSTR(s_c.a1_addr1, position(' ' in s_c.a1_addr1)+1)
  END as street,
  CASE
    -- when it may be a postal box
    WHEN position('BOX ' in upper(s_c.a1_addr1)) > 0 or position('BAG ' in 
upper(s_c.a1_addr1)) > 0
      THEN ''
    -- when address begins RR, this will be rural route
    WHEN position('RR ' in s_c.a1_addr1) = 1 OR position('RR#' in s_c.a1_addr1) 
= 1
      THEN ''   
    -- when address contains two commas, expect the number and street to be in 
the *last* part:
    WHEN char_length(split_part(s_c.a1_addr1, ',', 3))>0
      THEN TRIM(SUBSTR(split_part(s_c.a1_addr1, ',', 3),1,position(' ' in 
split_part(s_c.a1_addr1, ',', 3))))   
    -- when address contains one comma, expect the number and street to be in 
the *first* part:
    WHEN char_length(split_part(s_c.a1_addr1, ',', 2))>0
      THEN TRIM(SUBSTR(split_part(s_c.a1_addr1, ',', 1),1,position(' ' in 
split_part(s_c.a1_addr1, ',', 1))))    
    -- when address contains 2 hyphens, do:
    WHEN char_length(split_part(s_c.a1_addr1, '-', 3))>0 AND 
char_length(split_part(s_c.a1_addr1, '-', 4))=0
      THEN TRIM(SUBSTR(split_part(s_c.a1_addr1, '-', 3),1,position(' ' in 
split_part(s_c.a1_addr1, '-', 3))))
    -- when address contains 1 hyphen, do:
    WHEN char_length(split_part(s_c.a1_addr1, '-', 2))>0 AND 
char_length(split_part(s_c.a1_addr1, '-', 4))=0
      THEN TRIM(SUBSTR(split_part(s_c.a1_addr1, '-', 2),1,position(' ' in 
split_part(s_c.a1_addr1, '-', 2))))
    -- when address contains no hyphen, yet a space is present, do:
    WHEN char_length(split_part(s_c.a1_addr1, ' ', 2))>0
      THEN TRIM(SUBSTR(s_c.a1_addr1, 1,position(' ' in s_c.a1_addr1)))
  END as number,
  CASE
    -- when it may be a postal box
    WHEN position('BOX ' in upper(s_c.a1_addr1)) > 0 or position('BAG ' in 
upper(s_c.a1_addr1)) > 0
      THEN NULL
    -- when address begins RR, this will be rural route
    WHEN position('RR ' in s_c.a1_addr1) = 1 OR position('RR#' in s_c.a1_addr1) 
= 1
      THEN NULL   
    -- when address contains two commas, expect the number and street to be in 
the *last* part:
    WHEN char_length(split_part(s_c.a1_addr1, ',', 3))>0
      THEN split_part(s_c.a1_addr1, ',', 1) || ',' || split_part(s_c.a1_addr1, 
',', 2)   
    -- when address contains one comma, expect the number and street to be in 
the *first* part:
    WHEN char_length(split_part(s_c.a1_addr1, ',', 2))>0
      THEN split_part(s_c.a1_addr1, ',', 1)
    -- when address contains 2 hyphens, expect the number and street to be in 
the *last* part:
    WHEN char_length(split_part(s_c.a1_addr1, '-', 3))>0 AND 
char_length(split_part(s_c.a1_addr1, '-', 4))=0
      THEN split_part(s_c.a1_addr1, '-', 1) || ',' || split_part(s_c.a1_addr1, 
'-', 2)
    -- when address contains 1 hyphen, do:
    WHEN char_length(split_part(s_c.a1_addr1, '-', 2))>0 AND 
char_length(split_part(s_c.a1_addr1, '-', 4))=0
      THEN split_part(s_c.a1_addr1, '-', 1)
    -- when address contains no hyphen, yet a space is present, do:
    WHEN char_length(split_part(s_c.a1_addr1, ' ', 2))>0
      THEN NULL
  END as subunit,
  s_c.a1_addr2 as addendum,
  s_c.a1_city as city,
  s_c.a1_prov as state,
  s_c.a1_postalcode as postcode,
  s_c.a1_country as country
FROM staging.cpsbc s_c
-- WHERE ... condition by which to better examine candidate streets e.g.
WHERE char_length(split_part(s_c.a1_addr1, ' ', 2))>0
ORDER BY street, number
) AS virtual -- <--- else get 'subquery in FROM must have an alias'
INNER JOIN dem.urb d_u
ON UPPER(virtual.city)=UPPER(d_u.name)
INNER JOIN dem.state d_state
ON UPPER(virtual.state)=UPPER(d_state.code)
INNER JOIN dem.country d_c
ON UPPER(virtual.country)=UPPER(d_c.name)
WHERE UPPER(virtual.street) not in (SELECT UPPER(name) FROM dem.street)
ORDER BY street, number, subunit
;




reply via email to

[Prev in Thread] Current Thread [Next in Thread]