Search code examples
rapidminer

Sorting files into folders using RapidMiner based on classifier results


I need help sorting the files that RapidMiner classified into folders named after their labels. Is this task possible in RapidMiner, or with Java code that reads the resulting example set? This is the resulting table:

i.e., I just want to split the files into folders that represent their labels.

this is the sample example set

    Data: SimpleExampleSet: 15 examples, 31988 regular attributes, 
special attributes = { label = #0: label (polynominal/single_value)/values=[test1] 
metadata_file = #1: metadata_file (polynominal/single_value)/values=[0.txt, 1.txt, 10.txt, 11.txt, 12.txt, 13.txt, 14.txt, 2.txt, 3.txt, 4.txt, 5.txt, 6.txt, 7.txt, 8.txt, 9.txt] 
metadata_path = #2: metadata_path (polynominal/single_value)/values=[D:\Finaltests\test1\0.txt, D:\Finaltests\test1\1.txt, D:\Finaltests\test1\10.txt, D:\Finaltests\test1\11.txt, D:\Finaltests\test1\12.txt, D:\Finaltests\test1\13.txt, D:\Finaltests\test1\14.txt, D:\Finaltests\test1\2.txt, D:\Finaltests\test1\3.txt, D:\Finaltests\test1\4.txt, D:\Finaltests\test1\5.txt, D:\Finaltests\test1\6.txt, D:\Finaltests\test1\7.txt, D:\Finaltests\test1\8.txt, D:\Finaltests\test1\9.txt] 
metadata_date = #3: metadata_date (date_time/single_value) 
confidence_sport = #31993: confidence(sport) (real/single_value) 
confidence_places = #31994: confidence(places) (real/single_value) 
prediction = #31992: prediction(label) (binominal/single_value) }

thank you.


Solution

  • It's a slightly more complicated process than I said initially, so I've included an example below. It assumes Linux and copies every file from /tmp/old to either /tmp/new/A or /tmp/new/B; whether a file goes to A or B is determined by its label (in this example the label is assigned at random, purely for demonstration).

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      Example RapidMiner process: list the files in /tmp/old, give each one a
      label (A or B), and copy it to /tmp/new/<label>/<file name>.
      Files are copied, not moved, so the source directory is left untouched.
    -->
    <process version="7.0.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.0.001" expanded="true" name="Process">
        <process expanded="true">

          <!-- Step 1: visit every file in /tmp/old and log its directory and name. -->
          <operator activated="true" class="loop_files" compatibility="7.0.001" expanded="true" height="82" name="Loop Files" width="90" x="45" y="34">
            <parameter key="directory" value="/tmp/old"/>
            <process expanded="true">
              <!-- Make the parent_path macro available as a loggable value. -->
              <operator activated="true" class="provide_macro_as_log_value" compatibility="7.0.001" expanded="true" height="82" name="parent_path" width="90" x="179" y="34">
                <parameter key="macro_name" value="parent_path"/>
              </operator>
              <!-- Make the file_name macro available as a loggable value. -->
              <operator activated="true" class="provide_macro_as_log_value" compatibility="7.0.001" expanded="true" height="82" name="file_name" width="90" x="179" y="136">
                <parameter key="macro_name" value="file_name"/>
              </operator>
              <!-- Record one log row per file with the two values provided above. -->
              <operator activated="true" class="log" compatibility="7.0.001" expanded="true" height="82" name="Log" width="90" x="380" y="34">
                <list key="log">
                  <parameter key="parent_path" value="operator.parent_path.value.macro_value"/>
                  <parameter key="file_name" value="operator.file_name.value.macro_value"/>
                </list>
              </operator>
              <connect from_op="parent_path" from_port="through 1" to_op="file_name" to_port="through 1"/>
              <connect from_op="file_name" from_port="through 1" to_op="Log" to_port="through 1"/>
              <connect from_op="Log" from_port="through 1" to_port="out 1"/>
              <portSpacing port="source_file object" spacing="0"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>

          <!-- Step 2: turn the collected log rows into an example set. -->
          <operator activated="true" class="log_to_data" compatibility="7.0.001" expanded="true" height="103" name="Log to Data" width="90" x="179" y="34"/>

          <!--
            Step 3: create the "label" attribute. Here it is chosen at random
            ("A" or "B") as a stand-in; with real data the label would come from
            the classifier instead.
          -->
          <operator activated="true" class="generate_attributes" compatibility="7.0.001" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="313" y="34">
            <list key="function_descriptions">
              <parameter key="label" value="if(rand()&gt;0.5, &quot;A&quot;, &quot;B&quot;)"/>
            </list>
          </operator>

          <!--
            Step 4: build the source path ("old") and the destination path ("new").
            The destination is /tmp/new/<label>/<file name>.
          -->
          <operator activated="true" class="generate_attributes" compatibility="7.0.001" expanded="true" height="82" name="Generate Attributes" width="90" x="313" y="187">
            <list key="function_descriptions">
              <parameter key="old" value="parent_path + &quot;/&quot; + file_name"/>
              <parameter key="new" value="&quot;/tmp/new/&quot; + label+ &quot;/&quot; + file_name"/>
            </list>
          </operator>

          <!-- Step 5: for each example, pull its old/new paths into macros and copy the file. -->
          <operator activated="true" class="loop_examples" compatibility="7.0.001" expanded="true" height="82" name="Loop Examples" width="90" x="514" y="187">
            <process expanded="true">
              <!-- Read the "old" attribute of the current example into the macro "old". -->
              <operator activated="true" class="extract_macro" compatibility="7.0.001" expanded="true" height="68" name="old" width="90" x="112" y="34">
                <parameter key="macro" value="old"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="old"/>
                <parameter key="example_index" value="%{example}"/>
                <list key="additional_macros"/>
              </operator>
              <!-- Read the "new" attribute of the current example into the macro "new". -->
              <operator activated="true" class="extract_macro" compatibility="7.0.001" expanded="true" height="68" name="new" width="90" x="112" y="136">
                <parameter key="macro" value="new"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="new"/>
                <parameter key="example_index" value="%{example}"/>
                <list key="additional_macros"/>
              </operator>
              <!-- Copy the file from the extracted source path to the extracted destination path. -->
              <operator activated="true" class="copy_file" compatibility="7.0.001" expanded="true" height="82" name="Copy File" width="90" x="380" y="34">
                <parameter key="source_file" value="%{old}"/>
                <parameter key="new_file" value="%{new}"/>
              </operator>
              <connect from_port="example set" to_op="old" to_port="example set"/>
              <connect from_op="old" from_port="example set" to_op="new" to_port="example set"/>
              <connect from_op="new" from_port="example set" to_op="Copy File" to_port="through 1"/>
              <connect from_op="Copy File" from_port="through 1" to_port="example set"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
            </process>
          </operator>

          <!-- Wiring of the top-level steps, in execution order. -->
          <connect from_op="Loop Files" from_port="out 1" to_op="Log to Data" to_port="through 1"/>
          <connect from_op="Log to Data" from_port="exampleSet" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Loop Examples" to_port="example set"/>
          <connect from_op="Loop Examples" from_port="example set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    

    I used Copy File rather than moving the files, to avoid causing damage. Hopefully you can see how it works.

    In summary, if you create attributes holding the old and new file names, you have to use Loop Examples to go through each example. Inside this loop operator, you have to extract those values as macros so they can be passed to the Copy File operator.

    Hope that helps as a basis.