I am trying to generate summary of long PDF. So, what I did, first I converted my pdf to text using pdfminer.six
library. Next, I used 2 functions which were provided in a discuss here.
The code:
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
bart_model = BartModel.from_pretrained("facebook/bart-large", return_dict=True)
# generate chunks of text \ sentences <= 1024 tokens
def nest_sentences(document):
nested = []
sent = []
length = 0
for sentence in nltk.sent_tokenize(document):
length += len(sentence)
if length < 1024:
sent.append(sentence)
else:
nested.append(sent)
sent = [sentence]
length = len(sentence)
if sent:
nested.append(sent)
return nested
# generate summary on text with <= 1024 tokens
def generate_summary(nested_sentences):
device = 'cuda'
summaries = []
for nested in nested_sentences:
input_tokenized = bart_tokenizer.encode(' '.join(nested), truncation=True, return_tensors='pt')
input_tokenized = input_tokenized.to(device)
summary_ids = bart_model.to(device).generate(
input_tokenized,
length_penalty=3.0,
min_length=30,
max_length=100,
)
output = [bart_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
summaries.append(output)
summaries = [sentence for sublist in summaries for sentence in sublist]
return summaries
Then, to get the summary, I do:
nested_sentences = nest_sentences(text)
Where, text
is a text of string having length around 10K which I converted using pdf library.
summary = generate_summary(nested_sentences)
Then, I get the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-15-d5aa7709bb5f> in <module>()
----> 1 summary = generate_summary(nested_sentences)
3 frames
<ipython-input-11-8554509269e0> in generate_summary(nested_sentences)
28 length_penalty=3.0,
29 min_length=30,
---> 30 max_length=100,
31 )
32 output = [bart_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
/usr/local/lib/python3.7/dist-packages/torch/autograd/grad_mode.py in decorate_context(*args, **kwargs)
26 def decorate_context(*args, **kwargs):
27 with self.__class__():
---> 28 return func(*args, **kwargs)
29 return cast(F, decorate_context)
30
/usr/local/lib/python3.7/dist-packages/transformers/generation_utils.py in generate(self, input_ids, max_length, min_length, do_sample, early_stopping, num_beams, temperature, top_k, top_p, repetition_penalty, bad_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, encoder_no_repeat_ngram_size, num_return_sequences, max_time, max_new_tokens, decoder_start_token_id, use_cache, num_beam_groups, diversity_penalty, prefix_allowed_tokens_fn, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, forced_bos_token_id, forced_eos_token_id, remove_invalid_values, synced_gpus, **model_kwargs)
1061 return_dict_in_generate=return_dict_in_generate,
1062 synced_gpus=synced_gpus,
-> 1063 **model_kwargs,
1064 )
1065
/usr/local/lib/python3.7/dist-packages/transformers/generation_utils.py in beam_search(self, input_ids, beam_scorer, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs)
1799 continue # don't waste resources running the code we don't need
1800
-> 1801 next_token_logits = outputs.logits[:, -1, :]
1802
1803 # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
AttributeError: 'Seq2SeqModelOutput' object has no attribute 'logits'
I cannot find anything related to this error, so I would really appreciate it if anyone could help or is there any better approach to generate summary for long texts?
Thank you in advance!
The issue here is the BartModel line. Switch this for a BartForConditionalGeneration class and the problem will be solved. In essence the generation utilities assume that it is a model that can be used for language generation, and in this case the BartModel is just the base without the LM head.