Dataset<Row> kafkaStreamSet = sparkSession 
            .option("kafka.bootstrap.servers", kafkaBootstrap) 
            .option("subscribe", kafkaTopic) 
            .option("startingOffsets", "latest") 
            .option("failOnDataLoss", false) 
            .option("maxOffsetsPerTrigger", offsetsPerTrigger) 

    //raw message to ClickStream 
    Dataset<ClickStream> ds1 = kafkaStreamSet.mapPartitions(processClickStreamMessages, Encoders.bean(ClickStream.class));   


1) BookingRequest.java if value1,
2) PropertyPageView.java if value2.


    // Project the ClickStream records onto BookingRequest beans.
    Dataset<BookingRequest> ds2 = ds1.map(
            filterBookingRequests,
            Encoders.bean(BookingRequest.class));

    // Project the same ClickStream records onto PropertyPageView beans.
    Dataset<PropertyPageView> ds3 = ds1.map(
            filterPropertyPageViews,
            Encoders.bean(PropertyPageView.class));


   // Booking requests: append-mode sink with its own checkpoint directory and trigger interval.
    String bookingRequestCheckpointDir = "s3://" + s3Bucket + "/checkpoint/bookingRequests";
    String bookingRequestOutputDir = "s3://" + s3Bucket + "/" + bookingRequestPath;
    StreamingQuery bookingRequestsParquetStreamWriter = ds2
        .writeStream()
        .outputMode("append")
        .trigger(ProcessingTime.create(bookingRequestProcessingTime, TimeUnit.MILLISECONDS))
        .option("checkpointLocation", bookingRequestCheckpointDir)
        .start(bookingRequestOutputDir);

    // Page views: a second, independent query with a separate checkpoint directory.
    String pageViewCheckpointDir = "s3://" + s3Bucket + "/checkpoint/PageViews";
    String pageViewOutputDir = "s3://" + s3Bucket + "/" + pageViewPath;
    StreamingQuery PageViewsParquetStreamWriter = ds3
        .writeStream()
        .outputMode("append")
        .trigger(ProcessingTime.create(pageViewProcessingTime, TimeUnit.MILLISECONDS))
        .option("checkpointLocation", pageViewCheckpointDir)
        .start(pageViewOutputDir);




Caused by: org.apache.spark.sql.streaming.StreamingQueryException: Expected e.g. {"topicA":{"0":23,"1":-1},"topicB":{"0":-2}}, got {"userSessionEventJoin.global":{"92":154362528,"101 …
        at org.apache.spark.sql.kafka010.JsonUtils$.partitionOffsets(JsonUtils.scala:74)
        at org.apache.spark.sql.kafka010.KafkaSourceOffset$.apply(KafkaSourceOffset.scala:59)

如果我删除所有的检查点信息,程序可以再次启动,并在给定的 2 个位置重新建立检查点,但这意味着我必须重新从最新的偏移量开始处理,并丢失之前所有的偏移量。Spark 版本是 2.1,该主题有 100 多个分区。我只用一个写入流(一个检查点位置)做过测试,重新启动时同样会出现这个异常。


1) Use the getOrCreate API to create your Spark Streaming context.
   a) getOrCreate takes two parameters: the checkpoint directory and a function "(...) => StreamingContext" that builds a new context.
2) When the program starts for the first time, it uses the checkpoint directory to store its internal state (among other uses).
3) When the program crashes, or is stopped and restarted, the Spark Streaming context is recreated from the checkpoint, which gives you what you want.

由于 Stack Overflow 不鼓励只贴链接,我把示例代码放在下面。它基本上取自 Spark 在 GitHub 上的示例。

            * Counts words in text encoded with UTF8 received from the network every second. This example also
            * shows how to use lazily instantiated singleton instances for Accumulator and Broadcast so that
            * they can be registered on driver failures.
            * Usage: JavaRecoverableNetworkWordCount <hostname> <port> <checkpoint-directory> <output-file>
            *   <hostname> and <port> describe the TCP server that Spark Streaming would connect to receive
            *   data. <checkpoint-directory> directory to HDFS-compatible file system which checkpoint data
            *   <output-file> file to which the word counts will be appended
            * <checkpoint-directory> and <output-file> must be absolute paths
            * To run this on your local machine, you need to first run a Netcat server
            *      `$ nc -lk 9999`
            * and run the example as
            *      `$ ./bin/run-example org.apache.spark.examples.streaming.JavaRecoverableNetworkWordCount \
            *              localhost 9999 ~/checkpoint/ ~/out`
            * If the directory ~/checkpoint/ does not exist (e.g. running for the first time), it will create
            * a new StreamingContext (will print "Creating new context" to the console). Otherwise, if
            * checkpoint data exists in ~/checkpoint/, then it will create StreamingContext from
            * the checkpoint data.
            * Refer to the online documentation for more details.
            public final class JavaRecoverableNetworkWordCount {
            private static final Pattern SPACE = Pattern.compile(" ");

            private static JavaStreamingContext createContext(String ip,
                                                                int port,
                                                                String checkpointDirectory,
                                                                String outputPath) {

                // If you do not see this printed, that means the StreamingContext has been loaded
                // from the new checkpoint
                System.out.println("Creating new context");
                File outputFile = new File(outputPath);
                if (outputFile.exists()) {
                SparkConf sparkConf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount");
                // Create the context with a 1 second batch size
                JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

                // Create a socket stream on target ip:port and count the
                // words in input stream of \n delimited text (eg. generated by 'nc')
                JavaReceiverInputDStream<String> lines = ssc.socketTextStream(ip, port);
                JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator());
                JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
                    .reduceByKey((i1, i2) -> i1 + i2);

                wordCounts.foreachRDD((rdd, time) -> {
                // Get or register the blacklist Broadcast
                Broadcast<List<String>> blacklist =
                    JavaWordBlacklist.getInstance(new JavaSparkContext(rdd.context()));
                // Get or register the droppedWordsCounter Accumulator
                LongAccumulator droppedWordsCounter =
                    JavaDroppedWordsCounter.getInstance(new JavaSparkContext(rdd.context()));
                // Use blacklist to drop words and use droppedWordsCounter to count them
                String counts = rdd.filter(wordCount -> {
                    if (blacklist.value().contains(wordCount._1())) {
                    return false;
                    } else {
                    return true;
                String output = "Counts at time " + time + " " + counts;
                System.out.println("Dropped " + droppedWordsCounter.value() + " word(s) totally");
                System.out.println("Appending to " + outputFile.getAbsolutePath());
                Files.append(output + "\n", outputFile, Charset.defaultCharset());

                return ssc;

            public static void main(String[] args) throws Exception {
                if (args.length != 4) {
                System.err.println("You arguments were " + Arrays.asList(args));
                    "Usage: JavaRecoverableNetworkWordCount <hostname> <port> <checkpoint-directory>\n" +
                    "     <output-file>. <hostname> and <port> describe the TCP server that Spark\n" +
                    "     Streaming would connect to receive data. <checkpoint-directory> directory to\n" +
                    "     HDFS-compatible file system which checkpoint data <output-file> file to which\n" +
                    "     the word counts will be appended\n" +
                    "\n" +
                    "In local mode, <master> should be 'local[n]' with n > 1\n" +
                    "Both <checkpoint-directory> and <output-file> must be absolute paths");

                String ip = args[0];
                int port = Integer.parseInt(args[1]);
                String checkpointDirectory = args[2];
                String outputPath = args[3];

                // Function to create JavaStreamingContext without any output operations
                // (used to detect the new context)
                Function0<JavaStreamingContext> createContextFunc =
                    () -> createContext(ip, port, checkpointDirectory, outputPath);

                JavaStreamingContext ssc =
                JavaStreamingContext.getOrCreate(checkpointDirectory, createContextFunc);

这是 Spark 2.1.0 中的一个错误;改用 Spark 2.1.1 运行后,该问题似乎已得到修复。