Uses the address given by the user by default; allows bigger differences between the...
[platal.git] / include / geocoding.inc.php
index fc812fa..3e193e6 100644 (file)
@@ -1,6 +1,6 @@
 <?php
 /***************************************************************************
- *  Copyright (C) 2003-2009 Polytechnique.org                              *
+ *  Copyright (C) 2003-2010 Polytechnique.org                              *
  *  http://opensource.polytechnique.org/                                   *
  *                                                                         *
  *  This program is free software; you can redistribute it and/or modify   *
@@ -47,9 +47,10 @@ abstract class Geocoder {
                                 WHERE  name = {?}",
                               $address[$area . 'Name']);
             if ($res->numRows() == 0) {
-                $address[$area . 'Id'] = XDB::execute("INSERT INTO  " . $databases[$area] . " (name, country)
-                                                            VALUES  ({?}, {?})",
-                                                      $address[$area . 'Name'], $address['countryId']);
+                XDB::execute('INSERT INTO  ' . $databases[$area] . ' (name, country)
+                                   VALUES  ({?}, {?})',
+                             $address[$area . 'Name'], $address['countryId']);
+                $address[$area . 'Id'] = XDB::insertId();
             } else {
                 $address[$area . 'Id'] = $res->fetchOneCell();
             }
@@ -68,6 +69,16 @@ abstract class Geocoder {
         }
         return implode("\n", array_slice($textArray, 0, $limit));
     }
+
+    // Returns the number of a user's non-geocoded (accuracy = 0) 'home' addresses.
+    static public function countNonGeocoded($pid)
+    {
+        $res = XDB::query("SELECT  COUNT(*)
+                             FROM  profile_addresses
+                            WHERE  pid = {?} AND FIND_IN_SET('home', type) AND accuracy = 0",
+                          $pid);
+        return $res->fetchOneCell();
+    }
 }
 
 // Implementation of a Geocoder using the Google Maps API. Please refer to
@@ -82,10 +93,14 @@ class GMapsGeocoder extends Geocoder {
 
     // Maximum number of Geocoding calls to the Google Maps API.
     const MAX_GMAPS_RPC_CALLS = 5;
+    // Maximum Levenshtein distance allowed between the input and the geocoded text on a single line.
+    const MAX_LINE_DISTANCE = 5;
+    // Maximum Levenshtein distance allowed between the input and the geocoded text, summed over all lines.
+    const MAX_TOTAL_DISTANCE = 6;
 
     public function getGeocodedAddress(array $address) {
         $address = $this->prepareAddress($address);
-        $textAddress = $address['text'];
+        $textAddress = $this->getTextToGeocode($address);
 
         // Try to geocode the full address.
         if (($geocodedData = $this->getPlacemarkForAddress($textAddress))) {
@@ -111,32 +126,19 @@ class GMapsGeocoder extends Geocoder {
     }
 
     public function stripGeocodingFromAddress(array $address) {
-        unset($address['geoloc'], $address['geoloc_choice'], $address['countryId'],
-              $address['country'], $address['administrativeAreaName'],
+        unset($address['geoloc'], $address['geoloc_choice'], $address['geocodedPostalText'],
+              $address['countryId'], $address['country'], $address['administrativeAreaName'],
               $address['subAdministrativeAreaName'], $address['localityName'],
               $address['thoroughfareName'], $address['postalCode']);
         $address['accuracy'] = 0;
         return $address;
     }
+
     // Updates the address with the geocoded information from Google Maps. Also
     // cleans up the final informations.
     private function getUpdatedAddress(array $address, array $geocodedData, $extraLines) {
         $this->fillAddressWithGeocoding(&$address, $geocodedData);
-
-        // If the accuracy is 6, it means only the street has been gecoded
-        // but not the number, thus we need to fix it.
-        if ($address['accuracy'] == 6) {
-            $this->fixStreetNumber($address);
-        }
-
-        // We can now format the address.
         $this->formatAddress($address, $extraLines);
-
-        // Some entities in ISO 3166 are not countries, thus they have to be replaced
-        // by the country they belong to.
-        // TODO: fixCountry($address);
-
         return $address;
     }
 
@@ -153,7 +155,7 @@ class GMapsGeocoder extends Geocoder {
     // Prepares address to be geocoded
     private function prepareAddress($address) {
         $address['text'] = preg_replace('/\s*\n\s*/m', "\n", trim($address['text']));
-        // TODO: $address['postalAddress'] = getPostalAddress($address['text']);
+        $address['postalText'] = $this->getPostalAddress($address['text']);
         $address['updateTime'] = time();
         unset($address['changed']);
         return $address;
@@ -306,6 +308,7 @@ class GMapsGeocoder extends Geocoder {
         if ($extraLines) {
             $address['geoloc'] = $extraLines . "\n" . $address['geoloc'];
         }
+        $address['geocodedPostalText'] = $this->getPostalAddress($address['geoloc']);
         $geoloc = strtoupper(preg_replace(array("/[0-9,\"'#~:;_\- ]/", "/\r\n/"),
                                           array("", "\n"), $address['geoloc']));
         $text   = strtoupper(preg_replace(array("/[0-9,\"'#~:;_\- ]/", "/\r\n/"),
@@ -315,53 +318,104 @@ class GMapsGeocoder extends Geocoder {
         $countGeoloc = count($arrayGeoloc);
         $countText   = count($arrayText);
 
+        $totalDistance = 0;
         if (($countText > $countGeoloc) || ($countText < $countGeoloc - 1)
             || (($countText == $countGeoloc - 1)
                 && ($arrayText[$countText - 1] == strtoupper($address['country'])))) {
             $same = false;
         } else {
             for ($i = 0; $i < $countGeoloc && $i < $countText; ++$i) {
-                if (levenshtein($arrayText[$i], trim($arrayGeoloc[$i])) > 3) {
+                $lineDistance = levenshtein($arrayText[$i], trim($arrayGeoloc[$i]));
+                $totalDistance += $lineDistance;
+                if ($lineDistance > self::MAX_LINE_DISTANCE || $totalDistance > self::MAX_TOTAL_DISTANCE) {
                     $same = false;
+                    break;
                 }
             }
         }
+
         if ($same) {
-            $address['text'] = $address['geoloc'];
-            unset($address['geoloc']);
+            unset($address['geoloc'], $address['geocodedPostalText']);
+        } else {
+            $address['geoloc'] = str_replace("\n", "\r\n", $address['geoloc']);
+            $address['geocodedPostalText'] = str_replace("\n", "\r\n", $address['geocodedPostalText']);
         }
+        $address['text'] = str_replace("\n", "\r\n", $address['text']);
+        $address['postalText'] = str_replace("\n", "\r\n", $address['postalText']);
     }
  
-    // Search for the lign from the given address that is the closest to the geocoded thoroughfareName
-    // and replaces the corresponding lign in the geocoded text by it.
-    static protected function fixStreetNumber(&$address)
-    {
-        if (isset($address['thoroughfareName'])) {
-            $thoroughfareName  = $address['thoroughfareName'];
-            $thoroughfareToken = strtoupper(trim(preg_replace(array("/[,\"'#~:;_\-]/", "/\r\n/"),
-                                                              array("", "\n"), $thoroughfareName)));
-            $geolocLines = explode("\n", $address['geoloc']);
-            $textLines   = explode("\n", $address['text']);
-            $mindist = strlen($thoroughfareToken);
-            $minpos  = 0;
-            $pos     = 0;
-            foreach ($textLines as $i => $token) {
-                if (($l = levenshtein(strtoupper(trim(preg_replace(array("/[,\"'#~:;_\-]/", "/\r\n/"),
-                                                                   array("", "\n"), $token))),
-                                      $thoroughfareToken)) < $mindist) {
-                    $mindist = $l;
-                    $minpos  = $i;
+    // Returns the address formatted for postal use.
+    // The main rules are (cf. AFNOR XPZ 10-011):
+    // - everything in upper case;
+    // - if there are more than 38 characters on a line, split it;
+    // - if there are more than 32 characters in the description of the "street", use abbreviations.
+    private function getPostalAddress($text) {
+         static $abbreviations = array(
+             "IMPASSE"   => "IMP",
+             "RUE"       => "R",
+             "AVENUE"    => "AV",
+             "BOULEVARD" => "BVD",
+             "ROUTE"     => "R",
+             "STREET"    => "ST",
+             "ROAD"      => "RD",
+             );
+
+        $text = strtoupper($text);
+        $arrayText = explode("\n", $text);
+        $postalText = "";
+
+        foreach ($arrayText as $i => $lign) {
+            $postalText .= (($i == 0) ? "" : "\n");
+            if (($length = strlen($lign)) > 32) {
+                $words = explode(" ", $lign);
+                $count = 0;
+                foreach ($words as $word) {
+                    if (isset($abbreviations[$word])) {
+                        $word = $abbreviations[$word];
+                    }
+                    if ($count + ($wordLength = strlen($word)) <= 38) {
+                        $postalText .= (($count == 0) ? "" : " ") . $word;
+                        $count += (($count == 0) ? 0 : 1) + $wordLength;
+                    } else {
+                        $postalText .= "\n" . $word;
+                        $count = strlen($word);
+                    }
                 }
+            } else {
+                $postalText .= $lign;
             }
-            foreach ($geolocLines as $i => $line) {
-                if (strtoupper(trim($thoroughfareName)) == strtoupper(trim($line))) {
-                    $pos = $i;
-                    break;
-                }
+        }
+        return $postalText;
+    }
+
+    // Returns the text to geocode. If the last-but-one line of the address is an
+    // ISO 3166-1 item that is not a country (belongsTo is set in geoloc_countries),
+    // the last line (the name of the real country) is dropped; otherwise the text is unchanged.
+    private function getTextToGeocode($address)
+    {
+        $res = XDB::iterator('SELECT  country, countryFR
+                                FROM  geoloc_countries
+                               WHERE  belongsTo IS NOT NULL');
+        $countries = array();
+        foreach ($res as $item) {
+            $countries[] = $item[0];
+            $countries[] = $item[1];
+        }
+        $textLines  = explode("\n", $address['text']);
+        $countLines = count($textLines);
+        $needle     = strtoupper(trim($textLines[$countLines - 2]));
+        $isPseudoCountry = false;
+        foreach ($countries as $country) {
+            if (strtoupper($country) == $needle) {
+                $isPseudoCountry = true;
+                break;
             }
-            $geolocLines[$pos] = $textLines[$minpos];
-            $address['geoloc'] = implode("\n", $geolocLines);
         }
+
+        if ($isPseudoCountry) {
+            return implode("\n", array_slice($textLines, 0, -1));
+        }
+        return $address['text'];
     }
 }