java.lang.IllegalArgumentException: Wrong FS:, expected: hdfs://localhost:9000

Date: 2014-05-21 10:03:26

Tags: java hadoop mapreduce distributed-cache

I am trying to implement a reduce-side join, using a MapFile reader to look up values in the distributed cache, but the lookup is not returning anything; when I check stderr it shows the error below. The lookup file is already present in HDFS and appears to be loaded into the cache correctly, as shown in stdout.

  

java.lang.IllegalArgumentException: Wrong FS: file:/app/hadoop/tmp/mapred/local/taskTracker/distcache/-8118663285704962921_-1196516983_170706299/localhost/input/delivery_status/DeliveryStatusCodes/data, expected: hdfs://localhost:9000
    at org.apache.hadoop.fs.FileSystem.checkPath(FileSystem.java:390)
    at org.apache.hadoop.hdfs.DistributedFileSystem.getPathName(DistributedFileSystem.java:140)
    at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:554)
    at org.apache.hadoop.fs.FileSystem.getLength(FileSystem.java:816)
    at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1479)
    at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1474)
    at org.apache.hadoop.io.MapFile$Reader.createDataFileReader(MapFile.java:302)
    at org.apache.hadoop.io.MapFile$Reader.open(MapFile.java:284)
    at org.apache.hadoop.io.MapFile$Reader.<init>(MapFile.java:273)
    at org.apache.hadoop.io.MapFile$Reader.<init>(MapFile.java:260)
    at org.apache.hadoop.io.MapFile$Reader.<init>(MapFile.java:253)
    at mr_poc.reducerrsj.initializeDepartmentsMap(reducerrsj.java:59)
    at mr_poc.reducerrsj.setup(reducerrsj.java:42)
    at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:174)
    at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:649)
    at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:416)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
    at org.apache.hadoop.mapred.Child.main(Child.java:249)
java.lang.NullPointerException
    at mr_poc.reducerrsj.buildOutputValue(reducerrsj.java:83)
    at mr_poc.reducerrsj.reduce(reducerrsj.java:127)
    at mr_poc.reducerrsj.reduce(reducerrsj.java:1)
    at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177)
    at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:649)
    at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418)
    at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:416)
    at org.apache.hadoop.security.

This is my driver code:

package mr_poc;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class driverrsj extends Configured implements Tool{

    @Override
    public int run(String[] arg) throws Exception {
        if (arg.length != 3) {
            System.out.printf("3 parameters are required for DriverRSJ- <Input Dir1> <Input Dir2> <Output Dir> \n");
            return -1;
        }
        Job job = new Job(getConf());
        Configuration conf = job.getConfiguration();
        DistributedCache.addCacheFile(new URI("/input/delivery_status"), conf);
        System.out.println("Cache : " + job.getConfiguration().get("mapred.cache.files"));
        job.setJarByClass(driverrsj.class);
        conf.setInt("cust_info", 1);
        conf.setInt("status", 2);
        StringBuilder inputPaths = new StringBuilder();
        inputPaths.append(arg[0].toString()).append(",").append(arg[1].toString());
        FileInputFormat.setInputPaths(job, inputPaths.toString());
        FileOutputFormat.setOutputPath(job, new Path(arg[2]));
        job.setJarByClass(driverrsj.class);
        job.setMapperClass(mappperRSJ.class);
        job.setReducerClass(reducerrsj.class);
        job.setMapOutputKeyClass(CompositeKeyWritableRSJ.class);
        job.setMapOutputValueClass(Text.class);
        //job.setPartitionerClass(partinonrsj.class);
        job.setSortComparatorClass(secondarysortcomp.class);
        job.setGroupingComparatorClass(GroupingComparatorRSJ.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception{
        int exitCode = ToolRunner.run(new Configuration(), new driverrsj(),args);
        System.exit(exitCode);

    }


}

This is my reducer code:

package mr_poc;

import java.io.File;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class reducerrsj extends Reducer<CompositeKeyWritableRSJ, Text, NullWritable, Text>{
    StringBuilder reduceValueBuilder = new StringBuilder("");
    NullWritable nullWritableKey = NullWritable.get();
    Text reduceOutputValue = new Text("");
    String strSeparator = ",";
    private MapFile.Reader deptMapReader = null;
    Text txtMapFileLookupKey = new Text();
    Text txtMapFileLookupValue = new Text();
    //Path[] cacheFilesLocal;
    //Path[] eachPath;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());

        for (Path eachPath : cacheFiles) {
            System.out.println(eachPath.toString());
            System.out.println(eachPath.getName());
            if (eachPath.getName().toString().contains("delivery_status")) {
                URI uriUncompressedFile = new File(eachPath.toString() + "/DeliveryStatusCodes").toURI();
                initializeDepartmentsMap(uriUncompressedFile, context);
            }
        }
    }

    //@SuppressWarnings("deprecation")
    private void initializeDepartmentsMap(URI uriUncompressedFile, Context context)
            throws IOException {
        // {{
        // Initialize the reader of the map file (side data)
        Configuration conf = context.getConfiguration();
        conf.addResource(new Path("/usr/local/hadoop-1.2.1/conf/core-site.xml"));
        FileSystem dfs = FileSystem.get(conf);
        try {
            deptMapReader = new MapFile.Reader(dfs, uriUncompressedFile.toString(), context.getConfiguration());
        } catch (Exception e) {
            e.printStackTrace();
        }
        // }}
    }
    private StringBuilder buildOutputValue(CompositeKeyWritableRSJ key,
            StringBuilder reduceValueBuilder, Text value) {

        if (key.getsourceindex() == 2) {

            String arrSalAttributes[] = value.toString().split(",");
            txtMapFileLookupKey.set(arrSalAttributes[0].toString());
            System.out.println("key=" + txtMapFileLookupKey);

            try {
                deptMapReader.get(txtMapFileLookupKey, txtMapFileLookupValue);
            } catch (Exception e) {
                txtMapFileLookupValue.set("");
                e.printStackTrace();
            } finally {
                txtMapFileLookupValue
                        .set((txtMapFileLookupValue.equals(null) || txtMapFileLookupValue
                                .equals("")) ? "NOT-FOUND"
                                : txtMapFileLookupValue.toString());
            }

            reduceValueBuilder.append(txtMapFileLookupValue.toString());

        } else if (key.getsourceindex() == 1) {

            String arrEmpAttributes[] = value.toString().split(",");
            reduceValueBuilder.append(arrEmpAttributes[0].toString()).append(
                    strSeparator);
        }

        txtMapFileLookupKey.set("");
        txtMapFileLookupValue.set("");

        return reduceValueBuilder;
    }

    @Override
    public void reduce(CompositeKeyWritableRSJ key, Iterable<Text> values,
            Context context) throws IOException, InterruptedException {

        for (Text value : values) {
            buildOutputValue(key, reduceValueBuilder, value);
        }

        // Drop last comma, set value, and emit output
        if (reduceValueBuilder.length() > 1) {
            //reduceValueBuilder.setLength(reduceValueBuilder.length() - 1);
            // Emit output
            reduceOutputValue.set(reduceValueBuilder.toString());
            context.write(nullWritableKey, reduceOutputValue);
        } else {
            System.out.println("Key=" + key.getjoinkey() + "src="
                    + key.getsourceindex());
        }
        // Reset variables
        reduceValueBuilder.setLength(0);
        reduceOutputValue.set("");
    }
    @Override
    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        if (deptMapReader != null) {
            deptMapReader.close();
        }
    }
}

This is my core-site.xml:

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
  <name>hadoop.tmp.dir</name>
  <value>/app/hadoop/tmp</value>
  <description>A base for other temporary directories.</description>
</property>
<property>
  <name>fs.default.name</name>
  <value>hdfs://localhost:9000</value>
  <description>The name of the default file system.  A URI whose
  scheme and authority determine the FileSystem implementation.  The
  uri's scheme determines the config property (fs.SCHEME.impl) naming
  the FileSystem implementation class.  The uri's authority is used to
  determine the host, port, etc. for a filesystem.</description>
</property>
</configuration>

Any help would be highly appreciated. Thanks in advance!

4 Answers:

Answer 0 (Score: 8)

I had the same problem, and I solved it by adding
FileSystem fs = FileSystem.get(new URI("hdfs://localhost:9000"),conf)

in the driver class.

You have to import URI from java.net.URI.
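
As a rough sketch (not the answer author's full code), this line would sit in the run() method of the driver above, before the cache file is registered; the hdfs://localhost:9000 URI is taken from fs.default.name in the question's core-site.xml, and org.apache.hadoop.fs.FileSystem plus java.net.URI would need to be imported:

// Hypothetical placement inside driverrsj.run(), which already declares "throws Exception".
Job job = new Job(getConf());
Configuration conf = job.getConfiguration();
// Bind explicitly to the NameNode named in core-site.xml so paths resolve against HDFS.
FileSystem fs = FileSystem.get(new URI("hdfs://localhost:9000"), conf);
DistributedCache.addCacheFile(new URI("/input/delivery_status"), conf);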

Answer 1 (Score: 2)

I think I have run into a similar problem. The key point is that you are operating on a SequenceFile from the DistributedCache, which lives on your local file system. From your log, there is this line:

"org.apache.hadoop.hdfs.DistributedFileSystem.getPathName(DistributedFileSystem.java:140)" 

If you look at the source code of SequenceFile.Reader, you will find that this log entry comes from this code:

fs.getFileStatus(filename).getLen()    

This "fs" should be a LocalFileSystem here, not a DistributedFileSystem.

My solution is to change this:

deptMapReader = new MapFile.Reader(dfs, uriUncompressedFile.toString(), context.getConfiguration());

to:

Configuration conf = context.getConfiguration();
String originalFS = conf.get("fs.default.name");   // back up the original configuration
conf.set("fs.default.name", "file:///");           // switch the configuration to the local file system
deptMapReader = new MapFile.Reader(dfs, uriUncompressedFile.toString(), conf);
conf.set("fs.default.name", originalFS);           // restore the original configuration

After doing this, the SequenceFile.Reader object can access the cached file on the local file system.

I think this problem occurs because the SequenceFile API has changed; some of its APIs, such as the MapFile.Reader(fs, path, conf) constructor used here, have been deprecated.

This solution works for me.
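
For comparison, a minimal sketch of the same idea without temporarily rewriting fs.default.name would be to ask Hadoop for the local file system directly. This is an alternative to the answer's code, not what the answer author wrote, and it assumes the cached MapFile path is local, as the answer argues:

// Alternative sketch inside initializeDepartmentsMap(): read the cached MapFile
// through a LocalFileSystem handle instead of the HDFS-backed one.
Configuration conf = context.getConfiguration();
FileSystem localFs = FileSystem.getLocal(conf);   // always returns a LocalFileSystem
deptMapReader = new MapFile.Reader(localFs, uriUncompressedFile.toString(), conf);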

Answer 2 (Score: 2)

You should set the core-site.xml properties on your conf object to match your configuration files, like this:

conf.set("fs.defaultFS", "hdfs://host:port");
conf.set("mapreduce.jobtracker.address", "host:port");

Answer 3 (Score: 0)

Include the following in the driver:
DistributedCache.addCacheFile(new URI(""), conf);

and the code below in the mapper's setup method:

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration configuration = new Configuration();
    FileSystem fileSystem = null;
    try {
         fileSystem = FileSystem.get(new URI("<File location>"), configuration);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    String location = <S3 file location>;
    FSDataInputStream fsDataInputStream =fileSystem.open(new Path(location));
    Scanner scanner = new Scanner(fsDataInputStream);
    int i = 1;
    while(scanner.hasNextLine()) {
        String str[] = scanner.nextLine().split(",");
        LOG.info("keys are \t" + str[0] + str[1]);
        stickerMap.put(str[0] + str[1], i);
        ++i;
    }
}
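
For this snippet to compile, stickerMap and LOG would need to be declared as fields of the mapper. A minimal sketch of hypothetical declarations (the names, types, and the MyMapper class name are assumptions, along with imports of java.util.HashMap, java.util.Map, org.apache.commons.logging.Log, and org.apache.commons.logging.LogFactory):

// Hypothetical field declarations assumed by the setup() snippet above.
private static final Log LOG = LogFactory.getLog(MyMapper.class);
private final Map<String, Integer> stickerMap = new HashMap<String, Integer>();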