#!/usr/bin/perl # # Copyright (c) 2006 Dominic Hargreaves # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. # # Notes on usage: # This is fairly rough and ready. I lost the will to refine it any further # since it's essentially a one-shot affair. # You need to run it in two passes: # - import-postuk < postuk.csv # - import-postuk multi < postuk.csv # This is because many of the dupe districts are in the lines where one # town is listed for many districts and we prefer not to use them # If we add them last they'll be caught by the uniqueness requirement # of the DB # # There are a few other cases in the data I'm using where there are dupes # We don't really need to care that much about them. use strict; use warnings; use FindBin; use Text::CSV; # Find our private perl libraries use lib "$FindBin::Bin/../../perllib"; use NPEMap::Postcodes; use NPEMap; use Data::Dumper; my $multi; if ($ARGV[0] && ($ARGV[0] eq 'multi')) { $multi = 1; } my $dbh = setup_dbh() or die $!; my $csv = Text::CSV->new; my $sth = $dbh->prepare('INSERT INTO districts (outward, district,county) VALUES (?, ?, ?)'); #"Aberdeen","Aberdeenshire","AB1, AB2","Aberdeen","Scotland","" #"Blackburn","Lancashire","BB1, BB2 ... BB6","Blackburn","England","" while () { my $rawline = $_; # God this data is shoddy s// /; $csv->parse($_); my @fields = $csv->fields; if ($fields[0] and $fields[2]) { $fields[2] =~ s/(\w) (\w)/$1, $2/; $fields[2] =~ s/(\w),(\w)/$1, $2/; my @codes = split /, /, $fields[2]; if ((scalar(@codes) > 1) or $fields[2] =~ /\.\.\./) { next unless $multi; } else { next if $multi; } foreach my $code (@codes) { if ($code =~ /(\S+) \.\.\. (\S+)/) { my $first = $1; my $second = $2; $first =~ /(\w+?)(\d+)/; my $first_alpha = $1; my $first_num = $2; $second =~ /(\w+?)(\d+)/; my $second_alpha = $1; my $second_num = $2; $first_alpha =~ s/\W+//; $second_alpha =~ s/\W+//; die "bad loop" if ($first_alpha ne $second_alpha); for ($first_num .. $second_num) { $_ =~ s/\W+//; my $munged_code = $first_alpha . $_; if (!$sth->execute($munged_code, $fields[0], $fields[1])) { print STDERR $munged_code . "\n"; } } } else { $code =~ s/\W+//; if (!$sth->execute($code, $fields[0], $fields[1])) { print STDERR $code . "\n"; } } } } } $dbh->disconnect;