Tags: java, hadoop, mapreduce

Hadoop word count: get the total number of words that start with the letter "c"


Here's the Hadoop word count Java map and reduce source code:

In the map function, I've gotten to where I can output every word that starts with the letter "c" along with the number of times that word appears, but what I'm trying to do is output just the total number of words starting with the letter "c". I'm a little stuck on getting that total. Any help would be greatly appreciated, thank you.

Example

The output I'm currently getting:

could 2

can 3

cat 5

What I'm trying to get:

c-total 10

public static class MapClass extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, IntWritable> {

  private final static IntWritable one = new IntWritable(1);
  private Text word = new Text();

  public void map(LongWritable key, Text value,
                  OutputCollector<Text, IntWritable> output,
                  Reporter reporter) throws IOException {
    String line = value.toString();
    StringTokenizer itr = new StringTokenizer(line);
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      if (word.toString().startsWith("c")) { // keep only words starting with "c"
        output.collect(word, one);
      }
    }
  }
}


public static class Reduce extends MapReduceBase
    implements Reducer<Text, IntWritable, Text, IntWritable> {

  public void reduce(Text key, Iterator<IntWritable> values,
                     OutputCollector<Text, IntWritable> output,
                     Reporter reporter) throws IOException {
    int sum = 0;
    while (values.hasNext()) {
      sum += values.next().get(); // add up the counts for this word
    }
    output.collect(key, new IntWritable(sum)); // emit the word and its total
  }
}

Solution

  • Chris Gerken's answer is right.

    If you output the word itself as the key, the reducer can only give you the count of each distinct word starting with "c", not the overall total for "c".

    To get the total, the mapper has to emit one shared key for every match:

     while (itr.hasMoreTokens()) {
         String token = itr.nextToken();
         if (token.startsWith("c")) {
             word.set("C_Count"); // single shared key for all matches
             output.collect(word, one);
         }
     }
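
    As a side note, and not part of this answer's approach: with the old API you could skip the reduce-side sum entirely and use a job counter. A minimal sketch that would drop into the question's MapClass; the group and counter names "WordStats" / "STARTS_WITH_C" are made-up labels, not Hadoop constants:

        public void map(LongWritable key, Text value,
                        OutputCollector<Text, IntWritable> output,
                        Reporter reporter) throws IOException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                if (itr.nextToken().startsWith("c")) {
                    // "WordStats" / "STARTS_WITH_C" are arbitrary labels of our choosing
                    reporter.incrCounter("WordStats", "STARTS_WITH_C", 1);
                }
            }
        }

    The total is then reported with the job's counters when the job finishes, rather than written to the output files.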
    

    Here is a complete example using the new API (org.apache.hadoop.mapreduce).

    Driver class

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    
    public class WordCount {
    
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
    
            Job job = new Job(conf, "wordcount");
            FileSystem fs = FileSystem.get(conf);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            if (fs.exists(new Path(args[1])))
                fs.delete(new Path(args[1]), true);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
    
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
    
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            job.setJarByClass(WordCount.class);     
            job.waitForCompletion(true);
        }
    
    }
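
    One optional tweak that is not in the original answer: because reduce just performs an associative sum, the same Reduce class can also be registered as a combiner in the driver above, pre-aggregating on the map side to shrink the shuffle:

        // Optional: safe here because summing the 1s is associative and commutative.
        job.setCombinerClass(Reduce.class);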
    

    Mapper class

    import java.io.IOException;
    import java.util.StringTokenizer;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    public class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
    
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                String token = itr.nextToken();
                if(token.startsWith("c")){
                    word.set("C_Count");
                    context.write(word, one);
                }
    
            }
        }
    }
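
    Note that startsWith("c") is case-sensitive, so "Cat" would not be counted. If uppercase matches should also count (an assumption; the question only shows lowercase examples), lowercase the token before the test:

        // Assumption: count "Cat" the same as "cat" by normalizing case first.
        if (token.toLowerCase().startsWith("c")) {
            word.set("C_Count");
            context.write(word, one);
        }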
    

    Reducer class

    import java.io.IOException;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    public class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
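
    Because every match is emitted under the single key C_Count, the job's output file (typically part-r-00000) contains exactly one line; for the sample input in the question it would read:

        C_Count 10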