MapReduce WordCount program - output is the same as the input file

Date: 2014-11-03 09:04:26

Tags: hadoop mapreduce

I expect the output to be the count of each word in the input file, but my output is the entire input file, exactly as it is. I use extends Mapper&lt;LongWritable, Text, Text, IntWritable&gt; for the mapper class and Reducer&lt;Text, IntWritable, Text, IntWritable&gt; for the reducer class. Here is my code.

driver.java

public class driver extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "wordcount");

        job.setMapperClass(mapper.class);
        job.setReducerClass(reducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
        //JobClient.runJob((JobConf) conf);
        //System.exit(job.waitForCompletion(true) ? 0 : 1);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        //int res = ToolRunner.run(new Configuration(), new driver(),args);

        int res = ToolRunner.run(new Configuration(), new driver(), args);

        long stop = System.currentTimeMillis();
        System.out.println("Time: " + (stop - start));
        System.exit(res);
    }
}

mapper.java

public class mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    //hadoop supported data types
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    //map method that performs the tokenizer job and framing the initial key value pairs
    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);

        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            output.collect(word, one);
        }
    }
}
reducer.java
public class reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    //reduce method accepts the Key Value pairs from mappers, do the aggregation based on keys and produce the final out put
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
}

3 Answers:

Answer 0 (score: 1):

You are mixing the new and old MapReduce APIs. I think you tried to write the WordCount program against the new API but took snippets from the old one (perhaps from an old blog post). You can find the problem yourself simply by adding the @Override annotation to your map and reduce methods.

Look at how those method signatures changed when the API evolved:

You have simply written two new methods with the older signatures, so they do not override anything and are never called from anywhere. Your code does nothing, because the methods that actually get invoked are the inherited defaults, which are just identity operations (which is why your output is identical to your input).
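
For reference, here is a minimal sketch (not part of the original answer) of how the question's mapper.java and reducer.java would look against the new org.apache.hadoop.mapreduce API, keeping the question's class names. With @Override in place, a mismatched signature becomes a compile-time error instead of a silent fallback to the identity defaults:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// mapper.java -- new-API map(): Context replaces OutputCollector/Reporter
public class mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer tokenizer = new StringTokenizer(value.toString());
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            context.write(word, one);               // emit (word, 1)
        }
    }
}

// reducer.java -- new-API reduce(): values arrive as an Iterable, not an Iterator
public class reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();                     // add up the 1s for this word
        }
        context.write(key, new IntWritable(sum));   // emit (word, total count)
    }
}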

In any case, you should follow the basic coding conventions.

Answer 1 (score: 0):

If your code is giving you trouble, run this code instead. It contains the mapper, the reducer, and the main function.

import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;    
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class WordCount {

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);

            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Then build a jar from this code, say wordcount.jar, saved in your home directory (/home/user/wordcount.jar); one possible way to compile and package it is sketched right below. After that, run the job with the command that follows:
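
A minimal build sketch, and only a sketch: it assumes the hadoop command is on your PATH and that the code above is saved as WordCount.java in the current directory.

mkdir -p wordcount_classes
javac -classpath "$(hadoop classpath)" -d wordcount_classes WordCount.java
jar -cvf /home/user/wordcount.jar -C wordcount_classes/ .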

hadoop jar wordcount.jar WordCount /inputfile /outputfile

This will create the output directory /outputfile under the root of HDFS. View the results with:

hadoop dfs -cat /outputfile/part-00000

This will run your wordcount program successfully.
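
With the default TextOutputFormat, each line of that part file is a word and its count separated by a tab; the words and numbers below are purely illustrative:

hadoop	2
hello	3
world	1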

Answer 2 (score: 0):

Try this:

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;


public class WordCount  {

    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable> {

        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {

            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            System.out.println(line);
            while (tokenizer.hasMoreTokens()) {
                value.set(tokenizer.nextToken());
                output.collect(value, new IntWritable(1));
            }

        }
    }

    public static class Reduce extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }

            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {

        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("WordCount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path("/home/user17/test.txt"));
        FileOutputFormat.setOutputPath(conf, new Path("hdfs://localhost:9000/out2"));

        JobClient.runJob(conf);

    }
}

Build the jar and execute the given command on the command line:

hadoop jar WordCount.jar WordCount /inputfile /outputfile
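
If the job completes, the counts can be inspected the same way as in the previous answer, assuming a single reducer and the old-API default part file name (adjust the output path if yours differs):

hadoop fs -cat /outputfile/part-00000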