Tags: python, numpy, pytorch, tensorboard, flair

Transform flair language model tensors for viewing in TensorBoard Projector


I want to convert "vectors,"

vectors = [token.embedding for token in sentence]
print(type(vectors))
<class 'list'>

print(vectors)
[tensor([ 0.0077, -0.0227, -0.0004,  ...,  0.1377, -0.0003,  0.0028]),
...
tensor([ 0.0003, -0.0461,  0.0043,  ..., -0.0126, -0.0004,  0.0142])]

to

0.0077 -0.0227 -0.0004 ... 0.1377 -0.0003 0.0028
...
0.0003 -0.0461 0.0043 ... -0.0126 -0.0004 0.0142

and write that to a TSV.

Aside: those embeddings are from Flair (https://github.com/zalandoresearch/flair). How can I get the full output, rather than the abbreviated -0.0004 ... 0.1377 output?


Solution

  • OK, I dug around ...

    1. It turns out those are PyTorch tensors (Flair is built on PyTorch). For a simple conversion to plain Python lists, use tolist(), a PyTorch tensor method (per the PyTorch docs at https://pytorch.org/docs/stable/tensors.html#torch.Tensor.tolist and this Stack Overflow answer); Tensor.numpy() likewise converts to NumPy arrays.

      >>> import torch
      >>> a = torch.randn(2, 2)
      >>> print(a)
      tensor([[-2.1693,  0.7698],
              [ 0.0497,  0.8462]])
      
      >>> a.tolist()
      [[-2.1692984104156494, 0.7698001265525818],
       [0.049718063324689865, 0.8462421298027039]]
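
      Regarding the aside in the question: the "..." in the printed tensors is only a display truncation; the underlying tensor holds every value. A minimal sketch using torch.set_printoptions, which controls that truncation:

      >>> import torch
      >>> torch.set_printoptions(profile='full')     ## print all elements, no '...' elision
      >>> # printing a tensor now shows every value
      >>> torch.set_printoptions(profile='default')  ## restore the abbreviated display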
      

    2. Per my original question, here's how to convert those data to plain text and write them to TSV files.

      from flair.data import Sentence
      from flair.embeddings import FlairEmbeddings, StackedEmbeddings
      from flair.models import SequenceTagger
      
      ## Forward and backward Flair language models, pretrained on PubMed:
      embeddings_f = FlairEmbeddings('pubmed-forward')
      embeddings_b = FlairEmbeddings('pubmed-backward')
      
      sentence = Sentence('The RAS-MAPK signalling cascade serves as a central node in transducing signals from membrane receptors to the nucleus.')
      
      ## Pretrained named-entity recognition (NER) tagger:
      tagger = SequenceTagger.load('ner')
      
      tagger.predict(sentence)
      embeddings_f.embed(sentence)  ## (redundant here: stacked_embeddings below re-embeds)
      
      ## Concatenate the forward and backward embeddings for each token:
      stacked_embeddings = StackedEmbeddings([
          embeddings_f,
          embeddings_b,
      ])
      
      stacked_embeddings.embed(sentence)
      
      # for token in sentence:
      #     print(token)
      #     print(token.embedding)
      #     print(token.embedding.shape)
      
      tokens = [token for token in sentence]
      print(tokens)
      '''
        [Token: 1 The, Token: 2 RAS-MAPK, Token: 3 signalling, Token: 4 cascade, Token: 5 serves, Token: 6 as, Token: 7 a, Token: 8 central, Token: 9 node, Token: 10 in, Token: 11 transducing, Token: 12 signals, Token: 13 from, Token: 14 membrane, Token: 15 receptors, Token: 16 to, Token: 17 the, Token: 18 nucleus.]
      '''
      
      ## https://www.geeksforgeeks.org/python-string-split/
      
      ## str(token) renders as e.g. "Token: 1 The"; field [2] is the token text:
      tokens = [str(token).split()[2] for token in sentence]
      print(tokens)
      '''
        ['The', 'RAS-MAPK', 'signalling', 'cascade', 'serves', 'as', 'a', 'central', 'node', 'in', 'transducing', 'signals', 'from', 'membrane', 'receptors', 'to', 'the', 'nucleus.']
      '''
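
      ## Simpler alternative (Flair Token objects expose a .text attribute):
      # tokens = [token.text for token in sentence]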
      
      tensors = [token.embedding for token in sentence]
      print(tensors)
      '''
        [tensor([ 0.0077, -0.0227, -0.0004,  ...,  0.1377, -0.0003,  0.0028]),
        tensor([-0.0007, -0.1601, -0.0274,  ...,  0.1982,  0.0013,  0.0042]),
        tensor([ 4.2534e-03, -3.1018e-01, -3.9660e-01,  ...,  5.9336e-02, -9.4445e-05,  1.0025e-02]),
        tensor([ 0.0026, -0.0087, -0.1398,  ..., -0.0037,  0.0012,  0.0274]),
        tensor([-0.0005, -0.0164, -0.0233,  ..., -0.0013,  0.0039,  0.0004]),
        tensor([ 3.8261e-03, -7.6409e-02, -1.8632e-02,  ..., -2.8906e-03, -4.4556e-04,  5.6909e-05]),
        tensor([ 0.0035, -0.0207,  0.1700,  ..., -0.0193,  0.0017,  0.0006]),
        tensor([ 0.0159, -0.4097, -0.0489,  ...,  0.0743,  0.0005,  0.0012]),
        tensor([ 9.7725e-03, -3.3817e-01, -2.2848e-02,  ..., -6.6284e-02, 2.3646e-04,  1.0505e-02]),
        tensor([ 0.0219, -0.0677, -0.0154,  ...,  0.0102,  0.0066,  0.0016]),
        tensor([ 0.0092, -0.0431, -0.0450,  ...,  0.0060,  0.0002,  0.0005]),
        tensor([ 0.0047, -0.2732, -0.0408,  ...,  0.0136,  0.0005,  0.0072]),
        tensor([ 0.0072, -0.0173, -0.0149,  ..., -0.0013, -0.0004,  0.0056]),
        tensor([ 0.0086, -0.1151, -0.0629,  ...,  0.0043,  0.0050,  0.0016]),
        tensor([ 7.6452e-03, -2.3825e-01, -1.5683e-02,  ..., -5.4974e-04, -1.4646e-04,  6.6120e-03]),
        tensor([ 0.0038, -0.0354, -0.1337,  ...,  0.0060, -0.0004,  0.0102]),
        tensor([ 0.0186, -0.0151, -0.0641,  ...,  0.0188,  0.0391,  0.0069]),
        tensor([ 0.0003, -0.0461,  0.0043,  ..., -0.0126, -0.0004,  0.0142])]
      '''
      
      # ----------------------------------------
      ## Write those data to TSV files.
      
      ## https://stackoverflow.com/a/29896136/1904943
      
      import csv
      
      metadata_f = 'metadata.tsv'
      tensors_f = 'tensors.tsv'
      
      with open(metadata_f, 'w', encoding='utf8', newline='') as tsv_file:
          tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
          for token in tokens:
              ## Assign to a dummy variable ( _ ) to suppress writerow()'s return
              ## value (a character count) in interactive sessions. Wrap the string
              ## in a list: writerow(token) would treat the string as an iterable
              ## and write each character as a separate field:
              _ = tsv_writer.writerow([token])
      
      ## metadata.tsv :
      '''
        The
        RAS-MAPK
        signalling
        cascade
        serves
        as
        a
        central
        node
        in
        transducing
        signals
        from
        membrane
        receptors
        to
        the
        nucleus.
      '''
      
      with open(metadata_f, 'w', encoding='utf8', newline='') as tsv_file:
          tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
          _ = tsv_writer.writerow(tokens)
      
      ## metadata.tsv :
      '''
        The   RAS-MAPK    signalling  cascade serves  as  a   central node    in  transducing signals from    membrane    receptors   to  the nucleus.
      '''
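
      ## Note: the TensorBoard Projector expects ONE label per line, so for the
      ## Projector use the row-per-token version above, not this single-row one.
      ## With a single column there is nothing to escape, so the csv module is
      ## optional; an equivalent plain-file sketch:
      # with open(metadata_f, 'w', encoding='utf8') as f:
      #     f.write('\n'.join(tokens) + '\n')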
      
      with open(tensors_f, 'w', encoding='utf8', newline='') as tsv_file:
          tsv_writer = csv.writer(tsv_file, delimiter='\t', lineterminator='\n')
          for token in sentence:
              embedding = token.embedding
              ## tolist() converts each PyTorch tensor to a plain Python list,
              ## written as one tab-separated row per token:
              _ = tsv_writer.writerow(embedding.tolist())
      
      ## tensors.tsv (18 lines: one embedding per token in metadata.tsv):
      ## note: enormous output, even for this simple sentence.
      '''
        0.007691788021475077  -0.02268664352595806    -0.0004340760060586035  ...
      '''
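
      ## The Projector pairs metadata.tsv and tensors.tsv row by row, so the
      ## two files must have the same number of lines (here, 18 tokens):
      # assert len(tokens) == len(tensors)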
      

    3. Last, my intention for all of that was to load contextual language embeddings (Flair, etc.) into TensorFlow's Embedding Projector. It turns out all I needed to do was convert the data (here, Flair embeddings) to NumPy arrays and load them into a TensorFlow TensorBoard instance (no need for TSV files!).

      I describe that in detail in my blog post, here: Visualizing Language Model Tensors (Embeddings) in TensorFlow's TensorBoard [TensorBoard Projector: PCA; t-SNE; ...].
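
      For reference, a minimal sketch of that TSV-free route, using PyTorch's bundled TensorBoard writer (torch.utils.tensorboard; requires the tensorboard package). It assumes sentence has already been embedded as in step 2; the run directory name runs/flair_projector is arbitrary:

      import torch
      from torch.utils.tensorboard import SummaryWriter

      ## Stack the per-token embeddings into one (num_tokens x dim) matrix:
      mat = torch.stack([token.embedding for token in sentence])

      writer = SummaryWriter('runs/flair_projector')  ## hypothetical run directory
      writer.add_embedding(mat, metadata=[token.text for token in sentence])
      writer.close()

      ## Then launch TensorBoard and open the Projector tab:
      ##   tensorboard --logdir runs/flair_projector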