Question

我从Mapreduce代码获得了奇怪的输出：

输入：

aa bb  
aa cc  
bb aa  
cc dd  
dd bb  
xx aa  
ss rr

输出

aa  org.mapreduce.userscore.UserScore$ScoreWritable@1  
aa  org.mapreduce.userscore.UserScore$ScoreWritable@0  
aa  org.mapreduce.userscore.UserScore$ScoreWritable@1  
aa  org.mapreduce.userscore.UserScore$ScoreWritable@0  
bb  org.mapreduce.userscore.UserScore$ScoreWritable@0  
bb  org.mapreduce.userscore.UserScore$ScoreWritable@0  
bb  org.mapreduce.userscore.UserScore$ScoreWritable@1  
cc  org.mapreduce.userscore.UserScore$ScoreWritable@1  
cc  org.mapreduce.userscore.UserScore$ScoreWritable@0  
dd  org.mapreduce.userscore.UserScore$ScoreWritable@1  
dd  org.mapreduce.userscore.UserScore$ScoreWritable@0  
rr  org.mapreduce.userscore.UserScore$ScoreWritable@0  
ss  org.mapreduce.userscore.UserScore$ScoreWritable@1  
xx  org.mapreduce.userscore.UserScore$ScoreWritable@1

代码：

package org.mapreduce.userscore;

import java.io.*;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


public class UserScore  {

 /**
  * Custom Hadoop value type that carries two IntWritable counters, N and M.
  * The mapper in this listing emits (N=1, M=0) for the first username on a line
  * and (N=0, M=1) for the second one.
  *
  * NOTE(review): this version does not override toString(), so anything that
  * prints an instance as text gets Object's default "ClassName@hexHashCode" form.
  */
 public static class ScoreWritable implements Writable {
            // The two counters; both are serialized, N first and then M.
            private IntWritable N;
            private IntWritable M;

            // No-argument constructor: starts with two freshly allocated IntWritable objects.
            public ScoreWritable() {
                this.N = new IntWritable();
                this.M = new IntWritable();
            }

            // Convenience constructor: stores the given objects themselves (no copy is made).
            public ScoreWritable(IntWritable N, IntWritable M){
                this.N = N;
                this.M = M;
            }

            // Replaces both counters; the passed-in objects are stored by reference, not copied.
            public void set(IntWritable NN,IntWritable MM) {
                this.N = NN;
                this.M = MM;
            }

            // Returns the first counter (the live internal object, not a copy).
            public IntWritable getN() {
                return N;
            }

            // Returns the second counter (the live internal object, not a copy).
            public IntWritable getM() {
                return M;
            }

            @Override
            // De-serializes this record from the byte stream: N is read first, then M.
            // The order must mirror write().
            public void readFields(DataInput in) throws IOException {
                N.readFields(in);
                M.readFields(in);
            }

            @Override
            // Serializes this record into the byte stream: N is written first, then M.
            public void write(DataOutput out) throws IOException {
                N.write(out);
                M.write(out);
            }

            // equals() is disabled (commented out) in this version, while hashCode() below
            // is still overridden.
            //@Override
            //public boolean equals(Object o) {
                //if (o instanceof ScoreWritable) {
                //ScoreWritable other = (ScoreWritable) o;
                //return N.equals(other.N) && M.equals(other.M);
                //}
                //return false;
            //}

            @Override
            // Hash is delegated to N alone; M does not contribute.
            public int hashCode() {
                return N.hashCode();
            }

 }

 /**
  * Mapper: splits each input line into whitespace-separated tokens and emits one
  * (username, ScoreWritable) pair per token. The first token of a line is emitted
  * with (N=1, M=0); every later token of the same line is emitted with (N=0, M=1).
  */
 public static class Map extends Mapper<LongWritable, Text, Text, ScoreWritable> {
    // Output key and value objects, reused across map() calls.
    private Text user = new Text();
    private ScoreWritable score = new ScoreWritable();
    // Initial values are never read: both fields are reassigned before every use in map().
    private IntWritable NN = new IntWritable();
    private IntWritable MM = new IntWritable();

    // key: input key supplied by the input format (not used here); value: one line of text.
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Acts as a "first token?" flag: it is 1 for the first token and 2 afterwards.
        int iterator = 1;
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            user.set(tokenizer.nextToken());
            if (iterator == 1) {
                // First username on the line.
                NN = new IntWritable(1);
                MM = new IntWritable(0);
                iterator += 1;
            } else {
                // Any token after the first one.
                NN = new IntWritable(0);
                MM = new IntWritable(1);
            }
            score.set(NN,MM);
            context.write(user, score);
        }
    }
 }

 /**
  * Reducer declared to take (Text, ScoreWritable) pairs and emit (Text, IntWritable).
  * In this debugging version it ignores the incoming values and writes the constant 2
  * for every key.
  */
 public static class Reduce extends Reducer<Text, ScoreWritable, Text, IntWritable> {
     private IntWritable resultf = new IntWritable();
     public void reduce(Text key, Iterable<ScoreWritable> values, Context context) throws IOException, InterruptedException {
        // Disabled attempt at computing the score. It calls getN()/getM() on the Iterable
        // itself rather than on its elements.
        //int result = ((values.getN().get()) * (values.getM()).get());
        // NOTE(review): `result` has no visible declaration, because the only line that
        // would declare it is the commented-out one above.
        resultf.set(result);
        // Replaces resultf with a new IntWritable(2) and writes that constant for the key.
        context.write(key, resultf = new IntWritable(2));
    }
 }

 /**
  * Job driver: configures the "userscore" job and runs it.
  * args[0] is used as the input path and args[1] as the output path; the process
  * exits with 0 when waitForCompletion reports success and with 1 otherwise.
  */
 public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    //Create a new Jar and set the driver class(this class) as the main class of jar:
    Job job = new Job(conf, "userscore");
    job.setJarByClass(UserScore.class);

    //Set the map and reduce classes in the job:
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    // NOTE(review): Reduce is declared with ScoreWritable input values but IntWritable
    // output values, and the same class is registered as the combiner here. A combiner's
    // output presumably has to match the map output types - confirm against the Hadoop docs.
    job.setCombinerClass(Reduce.class);

    // Final (reducer) output types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // The map output types are NOT declared in this version (both calls are commented out),
    // although the mapper emits ScoreWritable values while the job output value is IntWritable.
    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(ScoreWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Run with four reduce tasks.
    job.setNumReduceTasks(4);

    //Set the input and the output path from the arguments
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    //Run the job and wait for its completion
    System.exit(job.waitForCompletion(true) ? 0 : 1);
 }

}

我正在尝试编写一个MapReduce程序来读取文本文件。文本文件的每一行包含一对字符串，表示社交网络中的两个用户名：第一个用户关注了第二个用户。我想统计每个用户关注的人数和被关注的人数（粉丝数），然后将这两个数字相乘，作为每个用户的一种得分。

我的想法是为值创建一个自定义的Writable类（ScoreWritable），把用户名作为Text类型的键、ScoreWritable对象作为值进行传递。你可能注意到了，我把Reduce的输出改成了常数"2"，只是为了检查，但输出仍然如上所示。

我做错了什么？

我在虚拟机中使用Cloudera映像来编译和运行jar文件。

Answer 1

您正在使用TextOutputFormat，它不知道如何打印（作为文本）您的自定义ScoreWritable，事实上它只输出ScoreWritable实例的字符串表示形式。我知道最快的解决方法是覆盖ScoreWritable的toString（）方法，例如

// Text form of the record: the two counters separated by a tab.
// Meant to be placed inside ScoreWritable, where N and M are its IntWritable fields.
// @Override makes the compiler verify that this really replaces Object.toString().
@Override
public String toString() {
    return N.get() + "\t" + M.get();
}

或者您可以编写自己的自定义OutputFormat。示例请参阅此处。

希望这会有所帮助

Answer 2

所以我设法使代码工作。如你所见，有一些问题：

自定义类中数据流的管理（我猜）——感谢@gtosto建议重写toString()。
在Reducer中错误地使用了变量。
在Reducer中使用了错误的迭代方式。

我还添加了一个单独的Combiner类来优化Mapper和Reducer之间的网络流。

这是最终的代码（带注释）：

package org.mapreduce.userscore;

import java.io.*;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


public class UserScore  {
    // Defining a custom class that contains two IntWritable values.
    // This custom class holds the value part of the key-value pairs passed between the mapper and the reducers.

 /**
  * Value type exchanged between the mapper, the combiner and the reducer.
  *
  * <p>Holds two counters, N and M. The mapper emits (1, 0) for the first username
  * on a line and (0, 1) for the second; the later stages add the counters up per user.
  *
  * <p>The constructor and {@link #set} store the given IntWritable objects by
  * reference, and the getters return the live internal objects.
  */
 public static class ScoreWritable implements Writable {
     private IntWritable N;
     private IntWritable M;

     /** Creates a record with two fresh IntWritable objects; required for deserialization. */
     public ScoreWritable() {
         this.N = new IntWritable();
         this.M = new IntWritable();
     }

     /** Creates a record backed by the given counters (stored by reference). */
     public ScoreWritable(IntWritable N, IntWritable M) {
         this.N = N;
         this.M = M;
     }

     /** Replaces both counters with the given objects (stored by reference). */
     public void set(IntWritable NN, IntWritable MM) {
         this.N = NN;
         this.M = MM;
     }

     /** Returns the first counter. */
     public IntWritable getN() {
         return N;
     }

     /** Returns the second counter. */
     public IntWritable getM() {
         return M;
     }

     /** Reads N and then M from the stream; the order must mirror {@link #write}. */
     @Override
     public void readFields(DataInput in) throws IOException {
         N.readFields(in);
         M.readFields(in);
     }

     /** Writes N and then M to the stream. */
     @Override
     public void write(DataOutput out) throws IOException {
         N.write(out);
         M.write(out);
     }

     /** Tab-separated text form, which is what TextOutputFormat prints for this value. */
     @Override
     public String toString() {
         return N.get() + "\t" + M.get();
     }

     /** Two records are equal when both counters are equal. */
     @Override
     public boolean equals(Object o) {
         if (this == o) {
             return true;
         }
         if (!(o instanceof ScoreWritable)) {
             return false;
         }
         ScoreWritable other = (ScoreWritable) o;
         return N.equals(other.N) && M.equals(other.M);
     }

     /** Combines both counters so that the hash is consistent with {@link #equals}. */
     @Override
     public int hashCode() {
         return 31 * N.hashCode() + M.hashCode();
     }
 }

 public static class Map extends Mapper<LongWritable, Text, Text, ScoreWritable> {
    private Text user = new Text();
    private ScoreWritable score = new ScoreWritable();  //variabe sscore will hold the pair (N,M) for eatch user
    private IntWritable NN = new IntWritable();
    private IntWritable MM = new IntWritable();

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        int iterator = 1;
        // tokenizing: variable tokenizer will hold the first username then the second username in each ine of the input text file
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            user.set(tokenizer.nextToken());
            if (iterator == 1) {                         // here variabe tokenizer holds the first username
                NN = new IntWritable(1);                 // saying that this user (username1) is folowing ssomeone
                MM = new IntWritable(0);
                iterator += 1;
            } else {                                     // here variabe tokenizer will hold the second username
                NN = new IntWritable(0);
                MM = new IntWritable(1);                 // saying that this user (username2) is being followed by someone
            }
            score.set(NN,MM);                            // giving eiter (1,0) or (0,1) to variable score
            context.write(user, score);                  // assigning variable score for each user in each line
        }   // emitting [Ali, (1,0)] or [Ali, (0,1)] means that Ali is following someone or being followed by someone, respectively.
    }       // next: the Reducer will go through all the values for each key, sum the total internal values of the key.
 }

    public static class Combine extends Reducer<Text, ScoreWritable, Text, ScoreWritable> {
        private IntWritable resultf = new IntWritable();
        private IntWritable NNN = new IntWritable();
        private IntWritable MMM = new IntWritable();
        public void reduce(Text key, Iterable<ScoreWritable> values, Reducer<Text, ScoreWritable, Text, ScoreWritable>.Context context)
                throws IOException, InterruptedException {
            int sum1 = 0;
            int sum2 = 0;
            for (ScoreWritable val:values) {
                sum1 += val.getN().get();
                sum2 += val.getM().get();
            }
            NNN = new IntWritable(sum1);
            MMM = new IntWritable(sum2);
            context.write(key, new ScoreWritable(NNN, MMM));    // this will combine all the values for each key before emitting the new pairs to Reduce function
        }
    }

 public static class Reduce extends Reducer<Text, ScoreWritable, Text, IntWritable> {
     private IntWritable resultf = new IntWritable();
     public void reduce(Text key, Iterable<ScoreWritable> values, Reducer<Text, ScoreWritable, Text, IntWritable>.Context context)
             throws IOException, InterruptedException {
         int sum3 = 0;
         int sum4 = 0;
         for (ScoreWritable val:values) {
             sum3 = val.getN().get();                // if the current user is following 20 people, then Sum3 = 20
             sum4 = val.getM().get();                // if the current user is being followed by 30 people, then Sum4 = 30
         }
         int result = sum3 * sum4;
         resultf.set(result);
         context.write(key, resultf);                // this will emit the current user and his/her corresponding score
    }
 }

 /**
  * Job driver: configures the "userscore" job and runs it.
  *
  * @param args args[0] is the input path, args[1] is the output path
  * @throws Exception if the job cannot be configured or submitted
  */
 public static void main(String[] args) throws Exception {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
        System.err.println("Usage: UserScore <input path> <output path>");
        System.exit(2);
    }

    Configuration conf = new Configuration();

    // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
    Job job = Job.getInstance(conf, "userscore");
    job.setJarByClass(UserScore.class);

    // Map, combine and reduce stages. The combiner is a separate class because its
    // output type (ScoreWritable) must match the map output, not the reducer output.
    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);

    // Map output types are declared explicitly because they differ from the final output types.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    // Final (reducer) output types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Four reduce tasks, hence four output files.
    job.setNumReduceTasks(4);

    // Input and output paths come from the command line.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Run the job and report success or failure through the exit code.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
 }

}

这是4个输出文本文件之一的一部分：

user0   2745
user1001    18724
user1005    2405
user1009    16577
user1012    1710
user1016    10074
user1023    2173
user1027    791

Reduce函数不影响最终输出

2 个答案: