Postgresql tsvector type contains alphabetically sorted lexemes along with positional information. How to get sorted array of lexemes from tsvector, sorted by their positional information instead of alphabetical order?
There is no ready-to-use feature for this but you can write a custom function to get what you want, e.g.:
create or replace function reorder_tsvector(vector tsvector)
returns text[] language sql as $$
select array_agg(concat(pos, ':', ar[1]) order by pos::int)
from (
select string_to_array(elem, ':') ar
from unnest(string_to_array(vector::text, ' ')) elem
) s,
unnest(string_to_array(ar[2], ',')) pos
$$;
select to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats');
to_tsvector
-----------------------------------------------------
'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4
(1 row)
select reorder_tsvector(to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats'));
reorder_tsvector
-------------------------------------------------------------
{2:'fat',3:'cat',4:'sat',7:'mat',9:'ate',11:'fat',12:'rat'}
(1 row)
Update. My versions of the OP's function:
create or replace function tokenize_orig_1(p_string text, p_dictionary regconfig)
returns text[] language sql as $$
select array_agg(lexeme order by positions)
from unnest(to_tsvector($2, $1)) arr
$$;
select tokenize_orig_1('a fat cat sat on a mat - it ate a fat rats', 'English');
tokenize_orig_1
---------------------------
{fat,cat,sat,mat,ate,rat}
(1 row)
or with duplicates:
create or replace function tokenize_orig_2(p_string text, p_dictionary regconfig)
returns text[] language sql as $$
select array_agg(lexeme order by pos)
from unnest(to_tsvector($2, $1)) arr,
unnest(positions) pos
$$;
select tokenize_orig_2('a fat cat sat on a mat - it ate a fat rats', 'English');
tokenize_orig_2
-------------------------------
{fat,cat,sat,mat,ate,fat,rat}
(1 row)