I would like to use a more detailed database for geoparsing, but I am running into some problems.
When I run this script:
import soton_corenlppy
import geoparsepy
import logging
import nltk

nltk.download('stopwords')

logger = logging.getLogger("geoparsepy")
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger.info('Logging started')

geospatial_config = geoparsepy.geo_parse_lib.get_geoparse_config(
    lang_codes=['it', 'en'],
    logger=logger
)

location_ids = {}
focus_areas = ['italia_posti']
for focus_area in focus_areas:
    location_ids[focus_area + '_admin'] = [-1, -1]
    location_ids[focus_area + '_poly'] = [-1, -1]
    location_ids[focus_area + '_line'] = [-1, -1]
    location_ids[focus_area + '_point'] = [-1, -1]

# Create a connection to the database
database_handler = soton_corenlppy.PostgresqlHandler.PostgresqlHandler(
    user='user',
    passw='password',
    hostname='hostname',
    port=5432,
    database='db'
)

# Load a set of previously preprocessed locations from the database
cached_locations = geoparsepy.geo_preprocess_lib.cache_preprocessed_locations(
    database_handle=database_handler,
    location_ids=location_ids,
    schema='public',
    geospatial_config=geospatial_config
)
logger.info(f"Loaded {len(cached_locations)} locations")

# Close the database connection
database_handler.close()

# Compile an inverted index from a list of arbitrary data where one column is a phrase string
indexed_locations = geoparsepy.geo_parse_lib.calc_inverted_index(
    list_data=cached_locations,
    dict_geospatial_config=geospatial_config
)
logger.info(f"Indexed {len(indexed_locations.keys())} phrases")

# Create an index of osmid to row indexes in cached_locations
osmid_lookup = geoparsepy.geo_parse_lib.calc_osmid_lookup(cached_locations=cached_locations)

listText = [
    u'Io sono di Sciacca, in provincia di agrigento',
    u'Vengo dalla provincia di Agrigento, in Sicilia',
    u'Mi sdraio sul prato del mio vicino',
    u'Pavia e Ravenna sono belle città',
    u'Voglio andare a new york',
    u'Mi trovo a San Giuliano Terme',
    u'Io sono di Sciacca, in provincia di Agrigento',
    u'Martina vive a Nuoro ma vorrebbe andare ad Agrigento',
    u'Agrigento è la provincia che contiene il comune di Sciacca',
    u'Vicino san giuliano terme c\'è un comune che si chiama Asciano',
    u'La città di Sciacca si trova in provincia di Agrigento',
    u'Mi trovo a Sciacca'
]

listTokenSets = []
for text in listText:
    # Tokenize a text entry into unigram tokens; the text is cleaned and then tokenized
    listToken = soton_corenlppy.common_parse_lib.unigram_tokenize_text(
        text=text,
        dict_common_config=geospatial_config
    )
    listTokenSets.append(listToken)

# Geoparse the token sets using the set of cached locations
listMatchSet = geoparsepy.geo_parse_lib.geoparse_token_set(
    token_set=listTokenSets,
    dict_inverted_index=indexed_locations,
    dict_geospatial_config=geospatial_config
)

# Print the matched locations
for i in range(len(listMatchSet)):
    logger.info(f"\nText: {listText[i]}")
    listMatch = listMatchSet[i]
    for tupleMatch in listMatch:
        logger.info(str(tupleMatch))
I get the following error:
<ipython-input-17-c67b3eda95f5> in <module>
34
35 # Load a set of previously preprocessed locations from database
---> 36 cached_locations = geoparsepy.geo_preprocess_lib.cache_preprocessed_locations(
37 database_handle=database_handler,
38 location_ids=location_ids,
c:\users\katarina\appdata\local\programs\python\python38\lib\site-packages\geoparsepy\geo_preprocess_lib.py in cache_preprocessed_locations(database_handle, location_ids, schema, geospatial_config, timeout_statement, timeout_overall, spatial_filter)
1647
1648 # note: SQL returns UTF8 encoded <str> objects. to get <unicode> use unicode( strText, 'utf8' )
-> 1649 listRows = database_handle.execute_sql_query_batch( listSQL, timeout_statement, timeout_overall )
1650
1651 listResult = []
c:\users\katarina\appdata\local\programs\python\python38\lib\site-packages\soton_corenlppy\PostgresqlHandler.py in execute_sql_query_batch(self, query_list, timeout_statement, timeout_overall)
346
347 # failure
--> 348 raise Exception( 'SQL query failed (timeout retrying) : ' + strLastError + ' : ' + tupleStatement[0] )
349
350 def execute_sql_statement( self, statement_list, timeout_statement = 60, timeout_overall = 180 ) :
Exception: SQL query failed (timeout retrying) : ['42883'] UndefinedFunction("function st_astext(geometry[]) does not exist\nLINE 1: ...osti_line_',loc_id),name,osm_id_set,admin_regions,ST_AsText(...\n ^\nHINT: No function matches the given name and argument types. You might need to add explicit type casts.\n") : SELECT concat('italia_posti_line_',loc_id),name,osm_id_set,admin_regions,ST_AsText(geom),hstore_to_matrix(tags) FROM public.italia_posti_line
I made empty line and polygon tables, as they appear in the original documentation. My point table has two extra columns, and admin_regions is an empty column; could that be the problem?
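For reference, the failing SELECT only touches these columns, so my line table is shaped roughly like the sketch below (the column types here are assumptions inferred from that query, not the exact geoparsepy schema):

-- Hypothetical sketch of a line table covering the columns referenced by the
-- failing SELECT; requires the postgis and hstore extensions, and the exact
-- types geoparsepy expects may differ.
CREATE TABLE public.italia_posti_line (
    loc_id        bigint PRIMARY KEY,
    name          text,
    osm_id_set    bigint[],
    admin_regions text[],
    geom          geometry,   -- a single geometry per row
    tags          hstore
);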
The database is complaining that you are calling ST_AsText with an array of geometries, ST_AsText(geometry[]), while it only accepts a single geometry, ST_AsText(geometry).
Example:
SELECT ST_AsText('0101000000000000000000F03F0000000000000040');
st_astext
------------
POINT(1 2)
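So the geom column of public.italia_posti_line (the table named in the error) is most likely declared as geometry[] rather than geometry. A sketch of how you could check that and, assuming each row only ever holds one geometry, convert the column in place:

-- Check the declared type of the geom column
-- (arrays show up with data_type 'ARRAY' and udt_name '_geometry')
SELECT column_name, data_type, udt_name
FROM information_schema.columns
WHERE table_schema = 'public'
  AND table_name = 'italia_posti_line'
  AND column_name = 'geom';

-- If it is an array, convert it to a plain geometry column,
-- keeping the first (assumed only) element of each array
ALTER TABLE public.italia_posti_line
    ALTER COLUMN geom TYPE geometry USING geom[1];

-- If rows can hold several geometries, merge them instead:
-- ALTER TABLE public.italia_posti_line
--     ALTER COLUMN geom TYPE geometry USING ST_Collect(geom);

After that, ST_AsText(geom) in the query geoparsepy generates should resolve normally. Apply the same check to your point and polygon tables as well.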