2010.
[platal.git] / include / geocoding.inc.php
CommitLineData
4c906759
SJ
1<?php
2/***************************************************************************
d4c08d89 3 * Copyright (C) 2003-2010 Polytechnique.org *
4c906759
SJ
4 * http://opensource.polytechnique.org/ *
5 * *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU General Public License as published by *
8 * the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
10 * *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU General Public License for more details. *
15 * *
16 * You should have received a copy of the GNU General Public License *
17 * along with this program; if not, write to the Free Software *
18 * Foundation, Inc., *
19 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
20 ***************************************************************************/
21
// Interface for an address geocoder. It provides support for transforming a
// free-form address into a fully structured one.
// TODO: define and use an Address object instead of a key-value map.
abstract class Geocoder {
    // Geocodes @p the address, and returns the corresponding updated address.
    // Unknown key-value pairs available in the input map are retained as-is.
    abstract public function getGeocodedAddress(array $address);

    // Cleans the address from its geocoded data.
    abstract public function stripGeocodingFromAddress(array $address);

    // Looks up @p area ('administrativeArea', 'subAdministrativeArea' or
    // 'locality') by name in the matching geoloc_* table, inserting a new row
    // when the name is unknown, and stores the result in $address[$area . 'Id'].
    // The address is updated in place; nothing is returned. The call is a no-op
    // when $address[$area . 'Name'] is not set or @p area is not a known level.
    static public function getAreaId(array &$address, $area)
    {
        // Whitelist of area levels; table names only ever come from this map,
        // never from the caller.
        static $databases = array(
            'administrativeArea'    => 'geoloc_administrativeareas',
            'subAdministrativeArea' => 'geoloc_subadministrativeareas',
            'locality'              => 'geoloc_localities',
        );

        if (isset($address[$area . 'Name']) && isset($databases[$area])) {
            $res = XDB::query("SELECT  id
                                 FROM  " . $databases[$area] . "
                                WHERE  name = {?}",
                              $address[$area . 'Name']);
            if ($res->numRows() == 0) {
                // NOTE(review): the id stored here is whatever XDB::execute()
                // returns -- confirm that it is the id of the inserted row.
                // NOTE(review): assumes $address['countryId'] is set -- verify
                // against callers.
                $address[$area . 'Id'] = XDB::execute("INSERT INTO  " . $databases[$area] . " (name, country)
                                                            VALUES  ({?}, {?})",
                                                      $address[$area . 'Name'], $address['countryId']);
            } else {
                $address[$area . 'Id'] = $res->fetchOneCell();
            }
        }
    }

    // Returns the part of the text preceding the line that contains the postal
    // code, capped at @p limit lines. When no line contains the postal code (or
    // the postal code is empty), the first @p limit lines are returned.
    static public function getFirstLines($text, $postalCode, $limit)
    {
        $textArray = explode("\n", $text);
        $postalCode = (string) $postalCode;
        // Never return more than $limit lines, nor more lines than available.
        $count = min(count($textArray), max(0, (int) $limit));
        for ($i = 0; $i < $count; ++$i) {
            // An empty needle would match every line, hence the explicit guard.
            if ($postalCode !== '' && strpos($textArray[$i], $postalCode) !== false) {
                $count = $i;
                break;
            }
        }
        return implode("\n", array_slice($textArray, 0, $count));
    }

    // Returns the number of non geocoded 'home' addresses (accuracy = 0) stored
    // in profile_addresses for the profile @p pid.
    static public function countNonGeocoded($pid)
    {
        $res = XDB::query("SELECT  COUNT(*)
                             FROM  profile_addresses
                            WHERE  pid = {?} AND FIND_IN_SET('home', type) AND accuracy = 0",
                          $pid);
        return $res->fetchOneCell();
    }
}
82
83// Implementation of a Geocoder using the Google Maps API. Please refer to
84// the following links for details:
85// http://code.google.com/apis/maps/documentation/services.html#Geocoding
86// http://code.google.com/intl/en/apis/maps/documentation/geocoding/
87// http://code.google.com/apis/maps/documentation/reference.html#GGeoAddressAccuracy
88//
89// It requires the properties gmaps_key and gmaps_url to be defined in section
90// Geocoder in plat/al's configuration (platal.ini & platal.conf).
91class GMapsGeocoder extends Geocoder {
92
93 // Maximum number of Geocoding calls to the Google Maps API.
94 const MAX_GMAPS_RPC_CALLS = 5;
95
// Geocodes @p address and returns the updated address map. The whole text is
// tried first; on failure, shorter and shorter tails of the text are tried,
// the skipped leading lines being handed over as extra lines. The prepared
// address is returned untouched when no attempt yields a placemark.
public function getGeocodedAddress(array $address) {
    $address = $this->prepareAddress($address);
    $fullText = $this->getTextToGeocode($address);

    // First attempt: the complete text.
    $placemark = $this->getPlacemarkForAddress($fullText);
    if ($placemark) {
        return $this->getUpdatedAddress($address, $placemark, null);
    }

    // Fallback: drop leading lines one at a time. The starting offset is
    // chosen so that, counting the attempt above, at most
    // MAX_GMAPS_RPC_CALLS requests are issued.
    $lines = explode("\n", $fullText);
    $total = count($lines);
    $firstSkipped = max(1, $total - self::MAX_GMAPS_RPC_CALLS + 1);
    for ($skipped = $firstSkipped; $skipped < $total; ++$skipped) {
        $head = implode("\n", array_slice($lines, 0, $skipped));
        $tail = implode("\n", array_slice($lines, $skipped));
        $placemark = $this->getPlacemarkForAddress($tail);
        if ($placemark) {
            return $this->getUpdatedAddress($address, $placemark, $head);
        }
    }

    // Nothing could be geocoded.
    return $address;
}
122
// Returns a copy of @p address without any of the fields produced by the
// geocoding, and with its accuracy reset to 0.
public function stripGeocodingFromAddress(array $address) {
    $geocodedKeys = array(
        'geoloc', 'geoloc_choice', 'geocodedPostalText',
        'countryId', 'country', 'administrativeAreaName',
        'subAdministrativeAreaName', 'localityName',
        'thoroughfareName', 'postalCode',
    );
    foreach ($geocodedKeys as $key) {
        unset($address[$key]);
    }
    $address['accuracy'] = 0;
    return $address;
}
00e5200b 131
4c906759
SJ
// Updates @p address with the geocoded information of @p geocodedData, then
// cleans up the final information. @p extraLines holds the leading lines of
// the text that were left out of the geocoding request (or null); they are
// passed on to formatAddress(). Returns the updated address.
private function getUpdatedAddress(array $address, array $geocodedData, $extraLines) {
    // The callee declares its first parameter by reference; call-time
    // pass-by-reference (&$address at the call site) is a fatal error since
    // PHP 5.4 and must not be used.
    $this->fillAddressWithGeocoding($address, $geocodedData);

    // If the accuracy is 6, it means only the street has been geocoded
    // but not the number, thus we need to fix it.
    if (isset($address['accuracy']) && $address['accuracy'] == 6) {
        $this->fixStreetNumber($address);
    }

    // We can now format the address.
    $this->formatAddress($address, $extraLines);

    return $address;
}
148
// Retrieves the Placemark array (see #getPlacemarkFromJson()) for the @p
// address, by querying the Google Maps API. Returns the array on success,
// and null otherwise.
private function getPlacemarkForAddress($address) {
    $geoData = $this->getGeoJsonFromUrl($this->getGeocodingUrl($address));

    // Only a non-empty array can hold a placemark; anything else (null, or
    // a JSON scalar) must not reach the array-typed parser below.
    if (!is_array($geoData) || !$geoData) {
        return null;
    }
    return $this->getPlacemarkFromJson($geoData);
}
158
// Prepares @p address for geocoding: trims the text and strips the whitespace
// surrounding each line break, recomputes the postal text from it, stamps the
// update time and drops the 'changed' marker. Returns the prepared address.
private function prepareAddress($address) {
    $cleanText = preg_replace('/\s*\n\s*/m', "\n", trim($address['text']));

    $address['text'] = $cleanText;
    $address['postalText'] = $this->getPostalAddress($cleanText);
    $address['updateTime'] = time();
    unset($address['changed']);

    return $address;
}
167
// Builds and returns the Google Maps geocoder url used to fetch information
// about @p address. The API key and the base url are read from the geocoder
// section of the global configuration.
private function getGeocodingUrl($address) {
    global $globals;

    $query = http_build_query(array(
        'key'    => $globals->geocoder->gmaps_key,
        'sensor' => 'false',  // The queried address wasn't obtained from a GPS sensor.
        'hl'     => 'fr',     // Output language.
        'oe'     => 'utf8',   // Output encoding.
        'output' => 'json',   // Output format.
        'gl'     => 'fr',     // Location preferences (addresses are in France by default).
        'q'      => $address, // The queried address.
    ));

    return $globals->geocoder->gmaps_url . '?' . $query;
}
185
// Fetches JSON-encoded data from a Google Maps API url, and decodes them.
// Returns the decoded array on success, and null when the fetch fails, the
// body is empty, or the body does not decode to an array.
private function getGeoJsonFromUrl($url) {
    global $globals;

    // Prepare a backtrace object to log errors (debug mode only).
    $bt = null;
    if ($globals->debug & DEBUG_BT) {
        if (!isset(PlBacktrace::$bt['Geoloc'])) {
            new PlBacktrace('Geoloc');
        }
        $bt = &PlBacktrace::$bt['Geoloc'];
        $bt->start($url);
    }

    // Fetch the geocoding data; false (failure) and '' (empty body) are both
    // treated as "nothing retrieved".
    $rawData = file_get_contents($url);
    if (!$rawData) {
        if ($bt) {
            $bt->stop(0, "Could not retrieve geocoded address from GoogleMaps.");
        }
        return null;
    }

    // Decode the JSON-encoded data, and check for their validity:
    // json_decode() yields null on malformed input and a scalar for a
    // scalar document, neither of which callers can use.
    $data = json_decode($rawData, true);
    if (!is_array($data)) {
        if ($bt) {
            $bt->stop(0, "Could not decode geocoded address from GoogleMaps.");
        }
        return null;
    }
    if ($bt) {
        $bt->stop(count($data), null, $data);
    }

    return $data;
}
218
// Extracts the most appropriate placemark from the JSON data fetched from
// Google Maps. Returns a Placemark array on success, and null otherwise. See
// http://code.google.com/apis/maps/documentation/services.html#Geocoding_Structured
// for details on the Placemark structure.
private function getPlacemarkFromJson(array $data) {
    // Check for geocoding failures.
    if (!isset($data['Status']['code']) || $data['Status']['code'] != 200) {
        // TODO: handle non-200 codes in a better way, since the code might
        // indicate a temporary error on Google's side.
        return null;
    }

    // Check that at least one placemark was found.
    if (empty($data['Placemark']) || !is_array($data['Placemark'])) {
        return null;
    }

    // Extract the placemark with the best accuracy. This is not always the
    // best result (since the same address may yield two different placemarks).
    // On equal accuracy the first placemark wins; a placemark without an
    // accuracy is ranked as 0.
    $result = null;
    $bestAccuracy = null;
    foreach ($data['Placemark'] as $place) {
        if (!is_array($place)) {
            continue;
        }
        $accuracy = isset($place['AddressDetails']['Accuracy'])
                  ? $place['AddressDetails']['Accuracy'] : 0;
        if ($result === null || $accuracy > $bestAccuracy) {
            $result = $place;
            $bestAccuracy = $accuracy;
        }
    }

    return $result;
}
247
// Fills @p address (modified in place) with the data of the placemark
// @p geocodedData. Only the pieces of information actually present in the
// placemark are copied; the other fields of the address are left untouched,
// except 'geoloc' which is always (re)written.
private function fillAddressWithGeocoding(&$address, $geocodedData) {
    // The geocoded address tree is
    // Country -> AdministrativeArea -> SubAdministrativeArea -> Locality -> Thoroughfare
    // with all the possible shortcuts.
    // The address is formatted as xAL, or eXtensible Address Language, an international
    // standard for address formatting.
    // xAL documentation: http://www.oasis-open.org/committees/ciq/ciq.html#6
    $address['geoloc'] = isset($geocodedData['address'])
                       ? str_replace(", ", "\n", $geocodedData['address']) : '';

    $details = (isset($geocodedData['AddressDetails']) && is_array($geocodedData['AddressDetails']))
             ? $geocodedData['AddressDetails'] : array();
    if (isset($details['Accuracy'])) {
        $address['accuracy'] = $details['Accuracy'];
    }

    // Levels are visited from the outermost to the innermost; a missing level
    // is skipped (shortcut) and the walk goes on from the current node.
    $levels = array(
        array('Country',               array('CountryNameCode' => 'countryId', 'CountryName' => 'country')),
        array('AdministrativeArea',    array('AdministrativeAreaName' => 'administrativeAreaName')),
        array('SubAdministrativeArea', array('SubAdministrativeAreaName' => 'subAdministrativeAreaName')),
        array('Locality',              array('LocalityName' => 'localityName')),
    );
    $currentPosition = $details;
    foreach ($levels as $level) {
        list($node, $fields) = $level;
        if (!isset($currentPosition[$node])) {
            continue;
        }
        $currentPosition = $currentPosition[$node];
        foreach ($fields as $source => $target) {
            if (isset($currentPosition[$source])) {
                $address[$target] = $currentPosition[$source];
            }
        }
    }
    // Thoroughfare and postal code are read from the deepest level reached.
    if (isset($currentPosition['Thoroughfare']['ThoroughfareName'])) {
        $address['thoroughfareName'] = $currentPosition['Thoroughfare']['ThoroughfareName'];
    }
    if (isset($currentPosition['PostalCode']['PostalCodeNumber'])) {
        $address['postalCode'] = $currentPosition['PostalCode']['PostalCodeNumber'];
    }

    // Gets coordinates.
    // NOTE(review): the legacy Google geocoder is believed to list
    // coordinates as (longitude, latitude, altitude), which would make the
    // two assignments below swapped. Kept as-is because stored data and
    // consumers are not visible from here -- confirm before changing.
    if (isset($geocodedData['Point']['coordinates'][0])) {
        $address['latitude'] = $geocodedData['Point']['coordinates'][0];
    }
    if (isset($geocodedData['Point']['coordinates'][1])) {
        $address['longitude'] = $geocodedData['Point']['coordinates'][1];
    }

    // Gets the bounding box.
    foreach (array('north', 'south', 'east', 'west') as $side) {
        if (isset($geocodedData['ExtendedData']['LatLonBox'][$side])) {
            $address[$side] = $geocodedData['ExtendedData']['LatLonBox'][$side];
        }
    }
}
306
// Formats the text of the geocoded address using the unused data
// (@p extraLines, prepended to the geocoded text when not empty) and compares
// it to the given address. @p address is modified in place:
//  - if both texts are close enough, the geocoded text and postal text
//    replace the user's ones and 'geoloc'/'geocodedPostalText' are removed;
//  - otherwise both versions are kept, so that the user can be asked to
//    choose between them.
// In both cases the remaining texts end up with "\r\n" line breaks.
private function formatAddress(&$address, $extraLines) {
    if ($extraLines) {
        $address['geoloc'] = $extraLines . "\n" . $address['geoloc'];
    }
    $address['geocodedPostalText'] = $this->getPostalAddress($address['geoloc']);

    $arrayGeoloc = explode("\n", self::normalizeForComparison($address['geoloc']));
    $arrayText   = explode("\n", self::normalizeForComparison($address['text']));
    $countGeoloc = count($arrayGeoloc);
    $countText   = count($arrayText);

    // The country goes through the same normalization as the lines it is
    // compared with; otherwise names holding a space or a dash never match.
    $country = isset($address['country']) ? self::normalizeForComparison($address['country']) : '';

    $same = true;
    if ($countText > $countGeoloc || $countText < $countGeoloc - 1) {
        // The geocoded text may hold at most one more line than the user's.
        $same = false;
    } elseif ($countText == $countGeoloc - 1 && $country !== ''
              && $arrayText[$countText - 1] === $country) {
        // The user's text already ends with the country, so the extra line
        // of the geocoded text is something else.
        $same = false;
    } else {
        // Here $countText <= $countGeoloc: compare the texts line by line.
        for ($i = 0; $i < $countText; ++$i) {
            if (levenshtein($arrayText[$i], trim($arrayGeoloc[$i])) > 3) {
                $same = false;
                break;
            }
        }
    }

    if ($same) {
        $address['text'] = $address['geoloc'];
        $address['postalText'] = $address['geocodedPostalText'];
        unset($address['geoloc'], $address['geocodedPostalText']);
    } else {
        $address['geoloc'] = str_replace("\n", "\r\n", $address['geoloc']);
        $address['geocodedPostalText'] = str_replace("\n", "\r\n", $address['geocodedPostalText']);
    }
    $address['text'] = str_replace("\n", "\r\n", $address['text']);
    $address['postalText'] = str_replace("\n", "\r\n", $address['postalText']);
}

// Returns @p text in upper case, stripped of digits, spaces and common
// punctuation, with "\r\n" line breaks turned into "\n"; used to compare
// address texts regardless of their formatting.
private static function normalizeForComparison($text) {
    return strtoupper(preg_replace(array("/[0-9,\"'#~:;_\- ]/", "/\r\n/"),
                                   array("", "\n"), $text));
}
347
5a10ab14
SJ
// Returns the address formatted for postal use.
// The main rules are (cf AFNOR XPZ 10-011):
//  - everything in upper case;
//  - a line of more than 32 characters gets its street type words abbreviated;
//  - such a line is then split so that no output line exceeds 38 characters
//    (a single word longer than that is kept whole on its own line).
// Lengths are counted in bytes, as strlen() does.
private function getPostalAddress($text) {
    static $abbreviations = array(
        "IMPASSE"   => "IMP",
        "RUE"       => "R",
        "AVENUE"    => "AV",
        "BOULEVARD" => "BVD",
        "ROUTE"     => "R",
        "STREET"    => "ST",
        "ROAD"      => "RD",
    );

    $postalLines = array();
    foreach (explode("\n", strtoupper($text)) as $line) {
        if (strlen($line) <= 32) {
            // Short enough: kept verbatim.
            $postalLines[] = $line;
            continue;
        }

        $current = '';
        foreach (explode(" ", $line) as $word) {
            if (isset($abbreviations[$word])) {
                $word = $abbreviations[$word];
            }
            if ($current === '') {
                // The first word of an output line is always accepted, so
                // that no empty line is emitted for an over-long word.
                $current = $word;
            } elseif (strlen($current) + 1 + strlen($word) <= 38) {
                // The "+ 1" accounts for the separating space.
                $current .= " " . $word;
            } else {
                $postalLines[] = $current;
                $current = $word;
            }
        }
        $postalLines[] = $current;
    }
    return implode("\n", $postalLines);
}
391
00e5200b
SJ
// Returns the text to send to the geocoder for @p address. The name of the
// real country (last line) is trimmed when the address also contains an
// ISO 3166-1 non-country item: for that purpose, the last but one line of the
// address is compared with all the non-country items of ISO 3166-1 (the rows
// of geoloc_countries having a belongsTo value). In every other case the text
// is returned unchanged.
private function getTextToGeocode($address)
{
    $textLines = explode("\n", $address['text']);
    $countLines = count($textLines);
    if ($countLines < 2) {
        // No "last but one" line to look at.
        return $address['text'];
    }

    $needle = strtoupper(trim($textLines[$countLines - 2]));
    if ($needle === '') {
        // An empty line cannot name a pseudo-country (and would otherwise
        // match any empty/NULL country name).
        return $address['text'];
    }

    $res = XDB::iterator('SELECT  country, countryFR
                            FROM  geoloc_countries
                           WHERE  belongsTo IS NOT NULL');
    $isPseudoCountry = false;
    foreach ($res as $item) {
        if (strtoupper($item[0]) == $needle || strtoupper($item[1]) == $needle) {
            $isPseudoCountry = true;
            break;
        }
    }

    if ($isPseudoCountry) {
        // Drop the real country, the pseudo-country now being the last line.
        return implode("\n", array_slice($textLines, 0, -1));
    }
    return $address['text'];
}
421
4c906759
SJ
// Searches the given address text for the line that is the closest to the
// geocoded thoroughfareName, and replaces the corresponding line of the
// geocoded text ('geoloc') by it. @p address is modified in place; nothing is
// done when it holds no thoroughfareName. The first line of either text is
// used when no better candidate is found.
static protected function fixStreetNumber(&$address)
{
    if (!isset($address['thoroughfareName'])) {
        return;
    }

    // Punctuation is dropped before measuring distances.
    $patterns = array("/[,\"'#~:;_\-]/", "/\r\n/");
    $replacements = array("", "\n");

    $street = $address['thoroughfareName'];
    $streetToken = strtoupper(trim(preg_replace($patterns, $replacements, $street)));
    $geocodedLines = explode("\n", $address['geoloc']);
    $userLines = explode("\n", $address['text']);

    // Closest line of the user's text (the first one wins on equal distance);
    // a line only qualifies if it beats the length of the street token.
    $bestDistance = strlen($streetToken);
    $bestIndex = 0;
    foreach ($userLines as $index => $userLine) {
        $candidate = strtoupper(trim(preg_replace($patterns, $replacements, $userLine)));
        $distance = levenshtein($candidate, $streetToken);
        if ($distance < $bestDistance) {
            $bestDistance = $distance;
            $bestIndex = $index;
        }
    }

    // Line of the geocoded text holding the street name.
    $targetIndex = 0;
    $wanted = strtoupper(trim($street));
    foreach ($geocodedLines as $index => $geocodedLine) {
        if ($wanted == strtoupper(trim($geocodedLine))) {
            $targetIndex = $index;
            break;
        }
    }

    $geocodedLines[$targetIndex] = $userLines[$bestIndex];
    $address['geoloc'] = implode("\n", $geocodedLines);
}
453}
454
455// vim:set et sw=4 sts=4 sws=4 foldmethod=marker enc=utf-8:
456?>