From ea647c8bce74481c1d56c4c73fc1127960b62c66 Mon Sep 17 00:00:00 2001
From: =?utf8?q?St=C3=A9phane=20Jacob?=
Date: Sun, 12 Dec 2010 15:35:01 +0100
Subject: [PATCH] Improves duplicate addresses deletion.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Stéphane Jacob
---
Review notes (this section sits below the three-dash line and is not part
of the commit message):
 * Abbreviation keys were reordered so that no key is shadowed by a key it
   contains: the haut, grand, saint and villa families now list their
   longest key first, and 'et ' now comes after 'street'.
 * Keys go through preg_quote() before being used as patterns, and check()
   compares with === instead of ==.
 * Comments were added; hunk sizes and diffstat counts were updated to match.
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy. If context lines do not match the target file, apply with
   --ignore-whitespace. The "index" blob ids predate these revisions.

 upgrade/1.0.1/merge_issues.php | 127 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 123 insertions(+), 4 deletions(-)

diff --git a/upgrade/1.0.1/merge_issues.php b/upgrade/1.0.1/merge_issues.php
index 30f715a..cf6f3d6 100755
--- a/upgrade/1.0.1/merge_issues.php
+++ b/upgrade/1.0.1/merge_issues.php
@@ -8,6 +8,118 @@ require_once '../../classes/address.php';
 
 $globals->debug = 0; // Do not store backtraces.
 
+// Abbreviations used to build the "short" form of an address before comparison.
+// They are applied one after the other, in array order, as plain substring
+// replacements, so order matters: a key must come before any other key it
+// contains ('haute' before 'haut', 'street' before 'et '), otherwise the
+// contained key rewrites the text first and the longer one never matches.
+$abbreviations = array(
+    'commandant'    => 'cdt',
+    'docteur'       => 'dr',
+    'haute'         => 'ht',
+    'hauts'         => 'ht',
+    'haut'          => 'ht',
+    'hts'           => 'ht',
+    'general'       => 'gen',
+    'gal '          => 'gen ',
+    'grande'        => 'gd',
+    'grands'        => 'gd',
+    'grand'         => 'gd',
+    'gde '          => 'gd ',
+    'gds '          => 'gd ',
+    'lieutenant'    => 'lt',
+    'marechal'      => 'mal',
+    'notre dame'    => 'n d',
+    'nouveau'       => 'nouv',
+    'president'     => 'pdt',
+    'saintes'       => 'st',
+    'sainte'        => 'st',
+    'saints'        => 'st',
+    'saint'         => 'st',
+    'ste '          => 'st ',
+    'appartement'   => 'app',
+    'apt'           => 'app',
+    'appt'          => 'app',
+    'appart'        => 'app',
+    'arrondissement'=> 'arr',
+    'batiment'      => 'bat',
+    'escalier'      => 'esc',
+    'etage'         => 'etg',
+    'immeuble'      => 'imm',
+    'lieu dit'      => 'ld',
+    ' lt '          => ' lt ',
+    'porte'         => 'pte',
+    'quartier'      => 'quart',
+    'residence'     => 'res',
+    'resi'          => 'res',
+    'village'       => 'vlge',
+    'villa'         => 'vla',
+    'vil '          => 'vlge ',
+    'allee'         => 'all',
+    'avenue'        => 'av',
+    'boulevard'     => 'bd',
+    'bld'           => 'bd',
+    'chemin'        => 'ch',
+    'chem '         => 'ch ',
+    'che '          => 'ch ',
+    'cours'         => 'crs',
+    'domaine'       => 'dom',
+    'doma '         => 'dom ',
+    'faubourg'      => 'fg',
+    'fbg'           => 'fg',
+    'hameau'        => 'ham',
+    'hame '         => 'ham ',
+    'impasse'       => 'imp',
+    'impa '         => 'imp ',
+    'lotissement'   => 'lot',
+    'montee'        => 'mte',
+    'passage'       => 'pass',
+    'place'         => 'pl',
+    'promenade'     => 'pro ',
+    'prom '         => 'pro ',
+    'quai'          => 'qu',
+    'rue'           => 'r',
+    'route'         => 'rte',
+    ' rde '         => ' rte ',
+    ' rle '         => ' rte ',
+    'sentier'       => 'sen',
+    'sent '         => 'sen ',
+    'square'        => 'sq',
+    'mount'         => 'mt',
+    'road'          => 'rd',
+    'street'        => 'st',
+    'str '          => 'str',
+    'et '           => 'etg',
+    'bis'           => 'b',
+    'ter'           => 't'
+);
+$patterns = array();
+$replacements = array();
+foreach ($abbreviations as $key => $abbreviation) {
+    // Keys are literal text: quote them so they can never act as regex syntax.
+    $patterns[] = '/' . preg_quote($key, '/') . '/';
+    $replacements[] = $abbreviation;
+}
+
+/**
+ * Tells whether two normalized addresses are duplicates of each other.
+ *
+ * Each address is an array with a 'long' form (no abbreviation applied) and
+ * a 'short' form (abbreviations applied); both forms have whitespace removed.
+ * Two addresses match as soon as any form of one equals any form of the other.
+ * Strict comparison is used because the forms may consist of digits only, and
+ * "==" compares numeric strings by value ("75005" == "075005").
+ *
+ * @param array $address1 Array with 'long' and 'short' string entries.
+ * @param array $address2 Array with 'long' and 'short' string entries.
+ * @return bool True if the two addresses should be considered identical.
+ */
+function check($address1, $address2)
+{
+    return $address1['short'] === $address2['short'] || $address1['short'] === $address2['long']
+        || $address1['long'] === $address2['short'] || $address1['long'] === $address2['long'];
+}
+
 print "Deletes duplicated addresses. (1/3)\n";
 $pids = XDB::rawFetchColumn("SELECT DISTINCT(pid)
                                FROM profile_addresses AS a1
@@ -26,14 +138,21 @@ foreach ($pids as $pid) {
     $count = 0;
     $it = Address::iterate(array($pid), array(Address::LINK_PROFILE), array(0));
     while ($item = $it->next()) {
-        $addresses[] = $item;
-        $rawAddresses[] = preg_replace('/[^a-zA-Z0-9]/', '', replace_accent($item->text));
+        $addresses[$count] = $item;
+        // Lowercased text with every character outside a-z0-9 turned into a space,
+        // so that abbreviation keys containing spaces can still match.
+        $rawAddress = preg_replace('/[^a-z0-9]/', ' ', mb_strtolower(replace_accent($item->text)));
+        $rawAddresses[$count] = array(
+            'long' => preg_replace('/\s+/', '', $rawAddress),
+            'short' => preg_replace('/\s+/', '', preg_replace($patterns, $replacements, $rawAddress)),
+        );
         ++$count;
     }
     for ($i = 0; $i < $count; ++$i) {
         for ($j = $i + 1; $j < $count; ++$j) {
-            if ($rawAddresses[$i] == $rawAddresses[$j]) {
-                $duplicates[$i] = true;
+            if (check($rawAddresses[$i], $rawAddresses[$j])) {
+                // Flag the later of the two addresses ($j is always greater than $i).
+                $duplicates[$j] = true;
             }
         }
     }
-- 
2.1.4