Hadoop MapReduce: running two jobs from a single Configured/Tool, second job produces no output

Asked: 2013-12-09 23:38:50

Tags: hadoop mapreduce chain

I need to run two MapReduce jobs so that the second takes the output of the first as its input. I would like to do this in a single invocation, where MyClass extends Configured and implements Tool.

I have the code written, and it works fine as long as I don't run both jobs in the same invocation (this works):

hadoop jar myjar.jar path.to.my.class.MyClass -i input -o output -m job1
hadoop jar myjar.jar path.to.my.class.MyClass -i dummy -o output -m job2

but this does not:

hadoop jar myjar.jar path.to.my.class.MyClass -i input -o output -m all

(-m stands for "mode")

In this case, the output of the first job never reaches the second job's mapper (I established this through debugging), but I can't figure out why.

I've looked at other posts about chaining, but they target the "old" mapred API. I need to run third-party code between the jobs, so I don't know whether ChainMapper/ChainReducer fits my use case.
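
For reference, the overall shape I'm trying to get to is plain sequential chaining with the new API, something like this simplified sketch (inputPath/midPath are placeholders, and runThirdPartyStep stands in for the code I need to run between the jobs):

Job first = new Job(getConf(), "first job");
FileInputFormat.addInputPath(first, inputPath);
FileOutputFormat.setOutputPath(first, midPath);
// ... mapper/reducer/output-format setup for the first job ...
if (!first.waitForCompletion(true)) {
    return 1;
}

runThirdPartyStep(midPath); // arbitrary non-MapReduce work between the jobs

// The second Job is constructed only after the first finishes, so it
// snapshots the Configuration at this point instead of up front.
Job second = new Job(getConf(), "second job");
FileInputFormat.addInputPath(second, midPath); // first job's output is second job's input
FileOutputFormat.setOutputPath(second, outPath);
// ... mapper/reducer/output-format setup for the second job ...
return second.waitForCompletion(true) ? 0 : 2;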

Using Hadoop version 1.0.3, AWS Elastic MapReduce distribution.

Code:

import java.io.IOException;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class MyClass extends Configured implements Tool  {

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new MyClass(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {

        CommandLineParser parser = new BasicParser();
        Options allOptions = setupOptions();
        Configuration conf = getConf();
        String[] argv_ = new GenericOptionsParser(conf, args).getRemainingArgs();
        CommandLine cmdLine = parser.parse(allOptions, argv_);

        boolean doJob1 = true;
        boolean doJob2 = true;
        if (cmdLine.hasOption('m')) {
            String mode = cmdLine.getOptionValue('m');
            if ("job1".equals(mode)) {
                doJob2 = false;
            } else if ("job2".equals(mode)){
                doJob1 = false;
            }
        }

        Path outPath = new Path(cmdLine.getOptionValue("output"), "job1out");

        Job job = new Job(conf, "HBase Prep For Data Build");
        Job job2 = new Job(conf, "HBase SessionIndex load");

        if (doJob1) {
            conf = job.getConfiguration();

            String[] values = cmdLine.getOptionValues("input");
            if (values != null && values.length > 0) {
                for (String input : values) {
                    System.out.println("input:" + input);
                    FileInputFormat.addInputPaths(job, input);
                }
            }

            job.setJarByClass(MyClass.class);
            job.setMapperClass(SessionMapper.class);

            MultipleOutputs.setCountersEnabled(job, false);
            MultipleOutputs.addNamedOutput(job, "sessionindex", TextOutputFormat.class, Text.class, Text.class);

            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(KeyValue.class);
            job.setOutputFormatClass(HFileOutputFormat.class);

            HTable hTable = new HTable(conf, "session");
            // Auto configure partitioner and reducer
            HFileOutputFormat.configureIncrementalLoad(job, hTable);

            FileOutputFormat.setOutputPath(job, outPath);

            if (!job.waitForCompletion(true)) {
                return 1;
            }
            // Load generated HFiles into table
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            loader.doBulkLoad(outPath, hTable);
            FileSystem fs = FileSystem.get(outPath.toUri(), conf);
            fs.delete(new Path(outPath, "cf"), true); // I delete this because the HBase bulk load leaves behind an empty directory, which causes problems later
        }

        /////////////////////////////////////////////
        // SECOND JOB                              //
        /////////////////////////////////////////////

        if (doJob2) {
            conf = job2.getConfiguration();

            System.out.println("-- job 2 input path : " + outPath.toString());
            FileInputFormat.setInputPaths(job2, outPath.toString());

            job2.setJarByClass(MyClass.class);
            job2.setMapperClass(SessionIndexMapper.class);

            MultipleOutputs.setCountersEnabled(job2, false);

            job2.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job2.setMapOutputValueClass(KeyValue.class);
            job2.setOutputFormatClass(HFileOutputFormat.class);

            HTable hTable = new HTable(conf, "session_index_by_hour");
            // Auto configure partitioner and reducer
            HFileOutputFormat.configureIncrementalLoad(job2, hTable);

            outPath = new Path(cmdLine.getOptionValue("output"), "job2out");
            System.out.println("-- job 2 output path: " + outPath.toString());
            FileOutputFormat.setOutputPath(job2, outPath);
            if (!job2.waitForCompletion(true)) {
                return 2;
            }
            // Load generated HFiles into table
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            loader.doBulkLoad(outPath, hTable);
        }
        return 0;
    }

    public static class SessionMapper extends
        Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

        private MultipleOutputs<ImmutableBytesWritable, KeyValue> multiOut;

        @Override
        public void setup(Context context) throws IOException {
            multiOut = new MultipleOutputs<ImmutableBytesWritable, KeyValue>(context);
        }

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            ...
            context.write(..., ...); // this is called multiple times
            multiOut.write("sessionindex", new Text(...), new Text(...), "sessionindex");
        }

        @Override
        public void cleanup(Context context) throws IOException, InterruptedException {
            multiOut.close(); // MultipleOutputs must be closed, or its records may not be flushed
        }
    }

    public static class SessionIndexMapper extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(new ImmutableBytesWritable(...), new KeyValue(...));
        }
    }

    private static Options setupOptions() {
        Option input = createOption("i", "input",
                "input file(s) for the Map step", "path", Integer.MAX_VALUE,
                true);
        Option output = createOption("o", "output",
                "output directory for the Reduce step", "path", 1, true);
        Option mode = createOption("m", "mode",
                "what mode ('all', 'job1', 'job2')", "-mode-", 1, false);
        return new Options().addOption(input).addOption(output)
                .addOption(mode);
    }

    public static Option createOption(String name, String longOpt, String desc,
            String argName, int max, boolean required) {
        OptionBuilder.withArgName(argName);
        OptionBuilder.hasArgs(max);
        OptionBuilder.withDescription(desc);
        OptionBuilder.isRequired(required);
        OptionBuilder.withLongOpt(longOpt);
        return OptionBuilder.create(name);
    }
}
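
For debugging, one check I can drop in right before job 2 is submitted is to list what is actually sitting in the intermediate directory (a quick sketch reusing outPath and conf from run() above; FileStatus would need to be imported as well):

FileSystem fs = FileSystem.get(outPath.toUri(), conf);
for (FileStatus stat : fs.listStatus(outPath)) {
    // the sessionindex-m-* file should be present and non-empty here
    System.out.println(stat.getPath() + " (" + stat.getLen() + " bytes)");
}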

Output (single invocation):

input:s3n://...snip...
13/12/09 23:08:43 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/12/09 23:08:43 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
13/12/09 23:08:43 INFO compress.CodecPool: Got brand-new compressor
13/12/09 23:08:43 INFO mapred.JobClient: Default number of map tasks: null
13/12/09 23:08:43 INFO mapred.JobClient: Setting default number of map tasks based on cluster size to : 2
13/12/09 23:08:43 INFO mapred.JobClient: Default number of reduce tasks: 1
13/12/09 23:08:43 INFO security.ShellBasedUnixGroupsMapping: add hadoop to shell userGroupsCache
13/12/09 23:08:43 INFO mapred.JobClient: Setting group to hadoop
13/12/09 23:08:43 INFO input.FileInputFormat: Total input paths to process : 1
13/12/09 23:08:43 INFO lzo.GPLNativeCodeLoader: Loaded native gpl library
13/12/09 23:08:43 WARN lzo.LzoCodec: Could not find build properties file with revision hash
13/12/09 23:08:43 INFO lzo.LzoCodec: Successfully loaded & initialized native-lzo library [hadoop-lzo rev UNKNOWN]
13/12/09 23:08:43 WARN snappy.LoadSnappy: Snappy native library is available
13/12/09 23:08:43 INFO snappy.LoadSnappy: Snappy native library loaded
13/12/09 23:08:44 INFO mapred.JobClient: Running job: job_201312062235_0044
13/12/09 23:08:45 INFO mapred.JobClient:  map 0% reduce 0%
13/12/09 23:09:09 INFO mapred.JobClient:  map 100% reduce 0%
13/12/09 23:09:27 INFO mapred.JobClient:  map 100% reduce 100%
13/12/09 23:09:32 INFO mapred.JobClient: Job complete: job_201312062235_0044
13/12/09 23:09:32 INFO mapred.JobClient: Counters: 42
13/12/09 23:09:32 INFO mapred.JobClient:   MyCounter1
13/12/09 23:09:32 INFO mapred.JobClient:     ValidCurrentDay=3526
13/12/09 23:09:32 INFO mapred.JobClient:   Job Counters
13/12/09 23:09:32 INFO mapred.JobClient:     Launched reduce tasks=1
13/12/09 23:09:32 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=19693
13/12/09 23:09:32 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
13/12/09 23:09:32 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
13/12/09 23:09:32 INFO mapred.JobClient:     Rack-local map tasks=1
13/12/09 23:09:32 INFO mapred.JobClient:     Launched map tasks=1
13/12/09 23:09:32 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=15201
13/12/09 23:09:32 INFO mapred.JobClient:   File Output Format Counters
13/12/09 23:09:32 INFO mapred.JobClient:     Bytes Written=1979245
13/12/09 23:09:32 INFO mapred.JobClient:   FileSystemCounters
13/12/09 23:09:32 INFO mapred.JobClient:     S3N_BYTES_READ=51212
13/12/09 23:09:32 INFO mapred.JobClient:     FILE_BYTES_READ=400417
13/12/09 23:09:32 INFO mapred.JobClient:     HDFS_BYTES_READ=231
13/12/09 23:09:32 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=859881
13/12/09 23:09:32 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=2181624
13/12/09 23:09:32 INFO mapred.JobClient:   File Input Format Counters
13/12/09 23:09:32 INFO mapred.JobClient:     Bytes Read=51212
13/12/09 23:09:32 INFO mapred.JobClient:   MyCounter2
13/12/09 23:09:32 INFO mapred.JobClient:     ASCII=3526
13/12/09 23:09:32 INFO mapred.JobClient:   StatsUnaggregatedMapEventTypeCurrentDay
13/12/09 23:09:32 INFO mapred.JobClient:     adProgress0=343
13/12/09 23:09:32 INFO mapred.JobClient:     asset=562
13/12/09 23:09:32 INFO mapred.JobClient:     podComplete=612
13/12/09 23:09:32 INFO mapred.JobClient:     adProgress100=247
13/12/09 23:09:32 INFO mapred.JobClient:     adProgress25=247
13/12/09 23:09:32 INFO mapred.JobClient:     click=164
13/12/09 23:09:32 INFO mapred.JobClient:     adProgress50=247
13/12/09 23:09:32 INFO mapred.JobClient:     adCall=244
13/12/09 23:09:32 INFO mapred.JobClient:     adProgress75=247
13/12/09 23:09:32 INFO mapred.JobClient:     podStart=613
13/12/09 23:09:32 INFO mapred.JobClient:   Map-Reduce Framework
13/12/09 23:09:32 INFO mapred.JobClient:     Map output materialized bytes=400260
13/12/09 23:09:32 INFO mapred.JobClient:     Map input records=3526
13/12/09 23:09:32 INFO mapred.JobClient:     Reduce shuffle bytes=400260
13/12/09 23:09:32 INFO mapred.JobClient:     Spilled Records=14104
13/12/09 23:09:32 INFO mapred.JobClient:     Map output bytes=2343990
13/12/09 23:09:32 INFO mapred.JobClient:     Total committed heap usage (bytes)=497549312
13/12/09 23:09:32 INFO mapred.JobClient:     CPU time spent (ms)=10120
13/12/09 23:09:32 INFO mapred.JobClient:     Combine input records=0
13/12/09 23:09:32 INFO mapred.JobClient:     SPLIT_RAW_BYTES=231
13/12/09 23:09:32 INFO mapred.JobClient:     Reduce input records=7052
13/12/09 23:09:32 INFO mapred.JobClient:     Reduce input groups=246
13/12/09 23:09:32 INFO mapred.JobClient:     Combine output records=0
13/12/09 23:09:32 INFO mapred.JobClient:     Physical memory (bytes) snapshot=519942144
13/12/09 23:09:32 INFO mapred.JobClient:     Reduce output records=7052
13/12/09 23:09:32 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=3076526080
13/12/09 23:09:32 INFO mapred.JobClient:     Map output records=7052
13/12/09 23:09:32 WARN mapreduce.LoadIncrementalHFiles: Skipping non-directory hdfs://10.91.18.96:9000/path/job1out/_SUCCESS
13/12/09 23:09:32 WARN mapreduce.LoadIncrementalHFiles: Skipping non-directory hdfs://10.91.18.96:9000/path/job1out/sessionindex-m-00000
1091740526
-- job 2 input path : /path/job1out
-- job 2 output path: /path/job2out
13/12/09 23:09:32 INFO mapred.JobClient: Default number of map tasks: null
13/12/09 23:09:32 INFO mapred.JobClient: Setting default number of map tasks based on cluster size to : 2
13/12/09 23:09:32 INFO mapred.JobClient: Default number of reduce tasks: 1
13/12/09 23:09:33 INFO mapred.JobClient: Setting group to hadoop
13/12/09 23:09:33 INFO input.FileInputFormat: Total input paths to process : 1
13/12/09 23:09:33 INFO mapred.JobClient: Running job: job_201312062235_0045
13/12/09 23:09:34 INFO mapred.JobClient:  map 0% reduce 0%
13/12/09 23:09:51 INFO mapred.JobClient:  map 100% reduce 0%
13/12/09 23:10:03 INFO mapred.JobClient:  map 100% reduce 33%
13/12/09 23:10:06 INFO mapred.JobClient:  map 100% reduce 100%
13/12/09 23:10:11 INFO mapred.JobClient: Job complete: job_201312062235_0045
13/12/09 23:10:11 INFO mapred.JobClient: Counters: 27
13/12/09 23:10:11 INFO mapred.JobClient:   Job Counters
13/12/09 23:10:11 INFO mapred.JobClient:     Launched reduce tasks=1
13/12/09 23:10:11 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=13533
13/12/09 23:10:11 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
13/12/09 23:10:11 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
13/12/09 23:10:11 INFO mapred.JobClient:     Launched map tasks=1
13/12/09 23:10:11 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=12176
13/12/09 23:10:11 INFO mapred.JobClient:   File Output Format Counters
13/12/09 23:10:11 INFO mapred.JobClient:     Bytes Written=0
13/12/09 23:10:11 INFO mapred.JobClient:   FileSystemCounters
13/12/09 23:10:11 INFO mapred.JobClient:     FILE_BYTES_READ=173
13/12/09 23:10:11 INFO mapred.JobClient:     HDFS_BYTES_READ=134
13/12/09 23:10:11 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=57735
13/12/09 23:10:11 INFO mapred.JobClient:   File Input Format Counters
13/12/09 23:10:11 INFO mapred.JobClient:     Bytes Read=0
13/12/09 23:10:11 INFO mapred.JobClient:   Map-Reduce Framework
13/12/09 23:10:11 INFO mapred.JobClient:     Map output materialized bytes=16
13/12/09 23:10:11 INFO mapred.JobClient:     Map input records=0
13/12/09 23:10:11 INFO mapred.JobClient:     Reduce shuffle bytes=16
13/12/09 23:10:11 INFO mapred.JobClient:     Spilled Records=0
13/12/09 23:10:11 INFO mapred.JobClient:     Map output bytes=0
13/12/09 23:10:11 INFO mapred.JobClient:     Total committed heap usage (bytes)=434634752
13/12/09 23:10:11 INFO mapred.JobClient:     CPU time spent (ms)=2270
13/12/09 23:10:11 INFO mapred.JobClient:     Combine input records=0
13/12/09 23:10:11 INFO mapred.JobClient:     SPLIT_RAW_BYTES=134
13/12/09 23:10:11 INFO mapred.JobClient:     Reduce input records=0
13/12/09 23:10:11 INFO mapred.JobClient:     Reduce input groups=0
13/12/09 23:10:11 INFO mapred.JobClient:     Combine output records=0
13/12/09 23:10:11 INFO mapred.JobClient:     Physical memory (bytes) snapshot=423612416
13/12/09 23:10:11 INFO mapred.JobClient:     Reduce output records=0
13/12/09 23:10:11 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=3058089984
13/12/09 23:10:11 INFO mapred.JobClient:     Map output records=0
13/12/09 23:10:11 WARN mapreduce.LoadIncrementalHFiles: Skipping non-directory hdfs://10.91.18.96:9000/path/job2out/_SUCCESS
13/12/09 23:10:11 WARN mapreduce.LoadIncrementalHFiles: Bulk load operation did not find any files to load in directory /path/job2out.  Does it contain files in subdirectories that correspond to column family names?

0 Answers:

No answers