Source code for geonlplify

import pandas as pd
import spacy
import requests # for geocoding with photon and OSM data
import numpy as np
from random import randrange # select random cities
import pkg_resources

importing_error = []  # collects import-time issues (missing data or missing spaCy model)

try:
    nlp_en = spacy.load("en_core_web_trf")  # load the spaCy model once at import time, no need to load it several times
except OSError:
    error = "Error: no spaCy model found! Download en_core_web_trf with: 'python -m spacy download en_core_web_trf'"
    importing_error.append(error)
# world_cities_df is loaded once at module level (below) to speed up run time

def load_simplemaps():
    """
    Load the simplemaps world-cities data from https://simplemaps.com/data/world-cities
    (CC-BY 4.0 attribution required, see https://simplemaps.com/data/world-cities).

    :return: a pandas DataFrame of world cities, or None if the data is missing
    """
    try:
        stream = pkg_resources.resource_stream(__name__, "/simplemaps/worldcities.csv")
        print("loading data from https://simplemaps.com/data/world-cities")
        return pd.read_csv(stream)
    except Exception:
        error = ("No data found! Download world-cities from simplemaps: "
                 "python3 -c \"from geonlplify import download_simplemaps_data; download_simplemaps_data()\"")
        importing_error.append(error)
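# Illustrative usage sketch (not part of the original module): once the simplemaps data has
# been downloaded, load_simplemaps() returns a pandas DataFrame whose "city" and "country"
# columns are the ones used by spatial_varations below.
#
#     cities = load_simplemaps()
#     if cities is not None:
#         print(cities[["city", "country"]].head())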
def importing_error_check():
    """
    Check whether any importing issue occurred (such as missing data or a missing spaCy model).

    :return:
    """
    if len(importing_error) != 0:
        raise ImportError
world_cities_df = load_simplemaps()
def geonlplify(text, method="spatial_synonym"):
    """
    GeoNLPlify builds variations of an input sentence by working on the spatial information
    contained in its words.

    Example:
        >>> geonlplify.geonlplify("5 cases of avian influenza found in Montpellier")
        '5 cases of avian influenza found in Bangalore'

    :param text: input text
    :param method: one of the 3 methods ["generalization", "specialization", "spatial_synonym"]
    :return: the variation of the input text
    """
    try:
        importing_error_check()
    except ImportError:
        print(*importing_error, sep="\n")
        exit()
    list_sne = find_sne(text)
    list_geonlplify_variant = spatial_varations(list_sne, method)
    return replace_variants(text, list_geonlplify_variant, method)
def replace_variants(text, list_of_variant, method, conserve_n_gram=True):
    """
    Replace each spatial named entity in the text by its variant.

    :param text: input text
    :param list_of_variant: list of SNE dictionaries, for example:
        - failed variant: [{'name': 'South Sudan', 'label': 'GPE', 'start_char': 56, 'end_char': 67,
          'generalization': nan, 'generalization_failed': 'sea'}]
        - valid variant: [{'name': 'New Jersey', 'label': 'GPE', 'start_char': 39, 'end_char': 49,
          'generalization': 'United States'}]
    :param method: the variation method used to build the variants
    :param conserve_n_gram: if True, only replace when the variant has the same number of words as the original entity
    :return: the text with the variants substituted
    """
    if list_of_variant:  # list is not empty
        for sne in list_of_variant:
            try:
                # np.nan != np.nan is always True, so use pd.notna() to detect failed variants
                if pd.notna(sne[method]) and sne["name"] != str(sne[method]):
                    if conserve_n_gram:
                        if sne["name"].count(' ') == str(sne[method]).count(' '):
                            text = text.replace(sne["name"], str(sne[method]))
                    else:
                        text = text.replace(sne["name"], str(sne[method]))
            except KeyError:  # the SNE has no variant for this method
                pass
    return text
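# Illustrative sketch with an assumed input (taken from the docstring above): with
# conserve_n_gram=True the replacement only happens when the variant has the same number of
# words as the original entity, e.g. "New Jersey" -> "United States" (both 2-grams).
#
#     variants = [{"name": "New Jersey", "label": "GPE", "start_char": 39, "end_char": 49,
#                  "generalization": "United States"}]
#     replace_variants("Cases reported in New Jersey", variants, "generalization")
#     # -> "Cases reported in United States"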
def geocode(sne_name):
    """
    Geocode a place name and retrieve its OSM properties.

    Example for sne_name="Montpellier":
        {'osm_id': 65442261, 'osm_type': 'N', 'country': 'France', 'osm_key': 'place',
         'city': 'Montpellier', 'countrycode': 'FR', 'osm_value': 'city', 'postcode': '34062',
         'name': 'Montpellier', 'county': 'Hérault', 'state': 'Occitanie', 'type': 'district'}

    :param sne_name: the place name to geocode
    :return: a dictionary of OSM properties
    """
    url_photon = "https://photon.komoot.io/api/?q="  # don't forget to force the lang: &lang=en
    try:
        reponses = requests.get(url_photon + str(sne_name) + "&lang=en")
        r = reponses.json()['features'][0]["properties"]
    except Exception:
        print("error when geocoding: " + sne_name)
        raise Exception("Geocoding failed")
    return r
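# Illustrative sketch: geocode() queries the public photon.komoot.io endpoint, so it needs
# network access. The "osm_value" field is what spatial_varations uses below to decide
# whether an entity is a city, a country, a sea or a continent.
#
#     props = geocode("Montpellier")
#     print(props["osm_value"], props["country"])  # e.g. "city France" (see docstring above)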
def find_sne(text):
    """
    Return the list of spatial named entities (SNE) found in the text.

    :param text: input text
    :return: list of SNE dictionaries
    """
    list_of_sne = []
    ner = nlp_en(text)
    for entity in ner.ents:
        if entity.label_ == "GPE":
            sne = {
                "name": str(entity),
                "label": entity.label_,
                "start_char": entity.start_char,
                "end_char": entity.end_char
            }
            list_of_sne.append(sne)
    return list_of_sne
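# Illustrative sketch: find_sne() keeps only the entities labelled "GPE" (geopolitical entity)
# by the spaCy en_core_web_trf model. For the docstring example of geonlplify() above, the
# expected result looks like:
#
#     find_sne("5 cases of avian influenza found in Montpellier")
#     # -> [{'name': 'Montpellier', 'label': 'GPE', 'start_char': 36, 'end_char': 47}]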
def spatial_varations(list_of_sne, method="generalization"):
    """
    Build a spatial variant for each SNE according to the chosen method.

    :param list_of_sne: list of SNE dictionaries produced by find_sne
    :param method: one of ["generalization", "specialization", "spatial_synonym"]
    :return: the list of SNE dictionaries enriched with their variant
    """
    list_of_variants_sne = []
    for sne in list_of_sne:
        try:
            geocoded = geocode(sne["name"])
        except Exception:
            continue
        if method == "generalization":
            try:
                if geocoded["osm_value"] == "sea":
                    sne["generalization"] = np.nan
                    sne["generalization_failed"] = "sea"
                elif geocoded["osm_value"] == "continent":
                    sne["generalization"] = np.nan
                    sne["generalization_failed"] = "continent"
                else:  # nothing special
                    sne["generalization"] = geocoded["country"]
            except KeyError:
                # raise Exception("No country for " + sne["name"] + " | geocode: \n" + str(geocoded))
                print("No country for " + sne["name"] + " | geocode: \n" + str(geocoded))
                continue
        elif method == "specialization":
            if geocoded["osm_value"] == "country":
                try:
                    country_cities = world_cities_df[world_cities_df["country"] == geocoded["country"]]
                    if len(country_cities) != 0:
                        sne["specialization"] = country_cities["city"].iloc[randrange(len(country_cities))]
                    else:
                        sne["specialization"] = np.nan
                except KeyError:
                    print("No country for " + sne["name"] + " | geocode: \n" + str(geocoded))
                    continue
        elif method == "spatial_synonym":
            if geocoded["osm_value"] == "city":
                try:
                    sne["spatial_synonym"] = world_cities_df["city"].iloc[randrange(len(world_cities_df))]
                except Exception:
                    print("No city for " + sne["name"] + " | geocode: \n" + str(geocoded))
                    sne["spatial_synonym"] = np.nan
            elif geocoded["osm_value"] == "country":
                try:
                    sne["spatial_synonym"] = world_cities_df["country"].iloc[randrange(len(world_cities_df))]
                except Exception:
                    print("No country for " + sne["name"] + " | geocode: \n" + str(geocoded))
                    sne["spatial_synonym"] = np.nan
        else:
            print("The method is unknown")
            raise Exception("The method is unknown")
        list_of_variants_sne.append(sne)
    return list_of_variants_sne


def download_simplemaps_data():
    """
    Download the simplemaps world-cities dataset into the geonlplify package directory.
    """
    import requests, zipfile
    from io import BytesIO
    print("GeoNLPlify uses simplemaps data: https://simplemaps.com/data/world-cities.\n"
          "Please be respectful of their license (CC-BY)")
    url = "https://simplemaps.com/static/data/world-cities/basic/simplemaps_worldcities_basicv1.75.zip"
    filename = url.split('/')[-1]
    req = requests.get(url)
    archive = zipfile.ZipFile(BytesIO(req.content))
    import geonlplify
    package_dir = geonlplify.__path__
    archive.extractall(package_dir[0] + "/simplemaps/")
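if __name__ == "__main__":
    # Minimal end-to-end sketch (not part of the original module): it requires the spaCy
    # en_core_web_trf model, the simplemaps data and network access to photon.komoot.io.
    # The output city is random, as in the geonlplify() docstring example.
    print(geonlplify("5 cases of avian influenza found in Montpellier"))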