Search code examples
twitter, tokenize, stop-words, rapidminer, text-classification

Tokenize and stopword don't work in Tweets DB using RapidMiner


I would like to tokenize and apply a stop-word filter to Twitter comments stored in a database, but the Process Documents operator does nothing. What am I doing wrong?

My goal is to apply these filters but keep the comments in rows instead of a single word vector.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process export (version 5.3.015): reads rows from a database,
     converts them to documents, and tokenizes each document. -->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Step 1: run the query over the "sqlserver2014" connection; it selects
           tweetid and content for the top 60 rows of [Tweets General]. -->
      <operator activated="true" class="read_database" compatibility="5.3.015" expanded="true" height="60" name="Server Connection (2)" width="90" x="45" y="30">
        <parameter key="connection" value="sqlserver2014"/>
        <parameter key="query" value="select top 60 tweetid,content from [Tweets General]"/>
        <enumeration key="parameters"/>
      </operator>
      <!-- Step 2: turn the example set into documents.
           NOTE(review): select_attributes_and_weights is "true" while the
           specify_weights list is empty; the accompanying solution suspects this
           deselects every attribute and recommends setting it to "false".
           NOTE(review): the solution also says nominal attributes must be
           converted to text (Nominal to Text) before this operator; no such
           operator is present in this process. -->
      <operator activated="true" class="text:data_to_documents" compatibility="5.3.002" expanded="true" height="60" name="Data to Documents" width="90" x="246" y="30">
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights"/>
      </operator>
      <!-- Step 3: Process Documents with an inner subprocess that contains only
           a Tokenize operator (default parameters).
           NOTE(review): there is no stop-word filter operator in the inner
           subprocess, although stop-word filtering is a stated goal. -->
      <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="447" y="30">
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (3)" width="90" x="246" y="75"/>
          <!-- Inner wiring: source document port to Tokenize, Tokenize to sink "document 1". -->
          <connect from_port="document" to_op="Tokenize (3)" to_port="document"/>
          <connect from_op="Tokenize (3)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Outer wiring: database output to Data to Documents, its documents to
           Process Documents, and the resulting example set to "result 1". -->
      <connect from_op="Server Connection (2)" from_port="output" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Solution

  • You need to convert any attributes of type nominal to type text before the "Data to Documents" operator; the "Nominal to Text" operator will do this. You also need to set the option "select attributes and weights" to false in "Data to Documents", because I think the setting you have will deselect everything.