Search code examples
solrsql-server-2019dataimporthandleriif-function

Solr data-config seems to ignore my entity query to populate field


I have a database with table [objs] with a column city. I want to use that value OR if a match in table cities is found that value instead.

When I run this on my database (it has just 5 records for now) it returns the desired output:

SELECT IIF(citygeonameid=0,city,c.name) as city FROM objs o 
LEFT JOIN cities c on c.geonameid=o.citygeonameid

In my solr data config I want to index the value from objs.city or if a match is found in cities the value from there and store it in a field city. I want to use the same name since my entire application is already using this field and I don't want to rename it everywhere. Also objs.city serves another purpose in my application logic.

I tried these two options in my data-config, but both solutions still show the objs.city value from the database rather than the normalizedgeo.city value from my data-config. I commented out the original field name, but still that value shows. I've tried restarting Solr, reloading the data-config, and have done a succesful full import each time to test. All to no avail. How is this possible?

data-config.xml

<dataConfig>
<dataSource driver="com.microsoft.sqlserver.jdbc.SQLServerDriver" url="jdbc:sqlserver://localhost:1433;databaseName=cars" user="XXX" password="XXXXX" />
    <document name="objs">
        <entity pk="id" name="obj" query="SELECT *
        ,LOWER(city) AS city_lower
        ,(LOWER(city)+'|'+CAST(provinceid as nvarchar)) AS city_provinceid 
        FROM objs WHERE deleted_date = '1/1/1900'"
        deletedPkQuery="SELECT id
                         FROM objs 
                         WHERE deleted_date &gt; '1/1/1900'"
        deltaQuery="SELECT id FROM objs WHERE updatedate &gt; '${dataimporter.last_index_time}'"
        deltaImportQuery="SELECT *
        ,LOWER(city) AS city_lower
        ,(LOWER(city)+'|'+CAST(provinceid as nvarchar)) AS city_provinceid 
        FROM objs WHERE id=${dataimporter.delta.id}"
        >
            <field name="userid" column="userid" />
            <field name="title" column="title" />
            <field name="type" column="type" />
            
            <field name="street" column="street" />
            <!--<field name="city" column="city" />-->
            
            <entity name="normalizedgeo" query="SELECT IIF(citygeonameid=0,city,gc.name) as city FROM objs l
                LEFT JOIN cities gc on gc.geonameid=l.citygeonameid
                WHERE l.id=${obj.id}">
                <field name="city" column="city" />
            </entity>           
            
            <field name="citygeonameid" column="citygeonameid" />

            
        </entity>
    </document>
</dataConfig>

Two configurations I tried for the normalized geo elements:

<!--<field name="city" column="city" />-->          
<entity name="normalizedgeo" query="SELECT IIF(citygeonameid=0,city,c.name) as city FROM objs o
    LEFT JOIN cities c on c.geonameid=o.citygeonameid
    WHERE o.id=${obj.id}">
    <field name="city" column="city" />
</entity>           

<!--<field name="city" column="city" />-->          
<entity name="normalizedgeo" query="SELECT IIF(citygeonameid=0,city,c.name) as city FROM objs o
    LEFT JOIN cities c on c.geonameid=o.citygeonameid
    WHERE o.id=${obj.id}">
    <!--<field name="city" column="city" />-->
</entity>           

schema.xml

<field name="city" type="string" indexed="true" stored="true"/> 

solrconfig.xml

<lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar" />

<requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
  <lst name="defaults">
    <str name="config">data-config.xml</str>
  </lst>
</requestHandler>

Database script

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[objs](
    [id] [int] IDENTITY(1,1) NOT NULL,
    [city] [nvarchar](150) NULL,
    [citygeonameid] [int] NOT NULL
)
GO
SET IDENTITY_INSERT [dbo].[objs] ON 
GO
INSERT [dbo].[objs] ([id], [city], [citygeonameid]) VALUES (17284118, N'NEW YORK CITY', 2759794)
GO
INSERT [dbo].[objs] ([id], [city], [citygeonameid]) VALUES (17284117, N'NEW YORK CITY', 2759794)
GO
INSERT [dbo].[objs] ([id], [city], [citygeonameid]) VALUES (17284116, N'NEW YORK CITY', 2759794)
GO
INSERT [dbo].[objs] ([id], [city], [citygeonameid]) VALUES (17284120, N'BOSTON', 0)
GO
SET IDENTITY_INSERT [dbo].[objs] OFF
GO
ALTER TABLE [dbo].[objs] ADD  CONSTRAINT [PK_obj] PRIMARY KEY NONCLUSTERED 
(
    [id] ASC
)WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, SORT_IN_TEMPDB = OFF, IGNORE_DUP_KEY = OFF, ONLINE = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
GO

GO
CREATE TABLE [dbo].[cities](
    [id] [int] IDENTITY(1,1) NOT NULL,
    [geonameid] [float] NULL,
    [name] [nvarchar](255) NOT NULL
)
GO
INSERT [dbo].[cities] ([id], [geonameid], [name]) VALUES (9, 2759794, N'New York City')

What I would expect as output of my Solr query (which works correctly with regular SQL in my SQL Server database) is:

New York City
New York City
New York City
BOSTON

But Solr shows: NEW YORK CITY NEW YORK CITY NEW YORK CITY BOSTON

I use Solr 8.10.1. Upon dataimport?command=full-import the data is indexed, but no errors or warnings are logged to http://localhost:8983/solr/#/~logging


Solution

  • Instead of configuring complex sub-queries in the data import handler you should better create a database view that exactly contains the data that should be passed to solr, e.g.:

    create view [dbo].[v_objs] as
    SELECT
      o.id,
      o.user_id,
      o.title,
      o.type,
      o.street,
      LOWER(city) AS city_lower,
      LOWER(city) + '|' + CAST(provinceid as nvarchar) AS city_provinceid,
      IIF(citygeonameid=0, city, c.name) AS city,
      o.citygeonameid,
      o.deleted_date,
      o.updatedate 
    FROM objs o LEFT JOIN cities c on c.geonameid=o.citygeonameid
    

    In the data-config you can treat the view like a normal table, e.g.:

    <dataConfig>
    <dataSource driver="com.microsoft.sqlserver.jdbc.SQLServerDriver" url="jdbc:sqlserver://localhost:1433;databaseName=cars" user="XXX" password="XXXXX" />
      <document name="objs">
        <entity
          pk="id" name="obj"
          query="SELECT * FROM v_objs WHERE deleted_date = '1/1/1900'"
          deletedPkQuery="SELECT id FROM v_objs WHERE deleted_date &gt; '1/1/1900'"
          deltaQuery="SELECT id FROM v_objs WHERE updatedate &gt; '${dataimporter.last_index_time}'"
          deltaImportQuery="SELECT * FROM v_objs WHERE id=${dataimporter.delta.id}"
        >
          <field name="userid" column="userid" />
          <field name="title" column="title" />
          <field name="type" column="type" />
          <field name="street" column="street" />
          <field name="city" column="city" />
          <field name="citygeonameid" column="citygeonameid" />
        </entity>
      </document>
    </dataConfig>
    

    This has several advantages:

    1. The database application can define and control its own interface to the solr engine.
    2. The database application can hide details about the database stucture.
    3. The database application can easily control which data will be shared with solr.
    4. The solr config doesn't need to know and work around db-specific details of the source application.

    Both, the source application and the solr engine will be easier to maintain, because they have a defined interface.