我正在尝试使用 Pyspark 中的 foreachbach 将数据帧写入 Mongodb 接收器,但出现错误。我使用的是 spark 版本 2.4.7 和 python 3.7。当我尝试将数据帧编写为批处理时,相同的代码工作正常。我使用了 mongo-spark-connector_2.11:2.4.1 这是代码和错误消息:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
spark = SparkSession.builder \
.master('local[3]') \
.config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1') \
.config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.7') \
.getOrCreate()
def mongoSink(df, batch_id):
df.write \
.format('mongo') \
.mode('append') \
.option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/db.cl') \
.save()
schema = StructType([....])
# Reading from Kafka topic
kafka_df = spark.readStream \
.format('kafka') \
.option('kafka.bootstrap.servers', 'localhost:9092') \
.option('subscribe', 'kTopic') \
.option('startingOffsets', 'latest') \
.load()
# Processing code
.
.
.
# Writing to MongoDB
write_df_mongodb = f_df.writeStream \
.format('mongo') \
.foreachBatch(mongoSink) \
.option("checkpointLocation", "chk_dir") \
.outputMode('append') \
.start()
write_df_mongodb.awaitTermination()
错误信息:
Py4JJavaError: An error occurred while calling o77.awaitTermination.
: org.apache.spark.sql.streaming.StreamingQueryException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
File "D:\Spark2-4\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 2381, in _call_proxy
return_value = getattr(self.pool[obj_id], method)(*params)
File "D:\Spark2-4\python\pyspark\sql\utils.py", line 191, in call
raise e
File "D:\Spark2-4\python\pyspark\sql\utils.py", line 188, in call
self.func(DataFrame(jdf, self.sql_ctx), batch_id)
File "<ipython-input-5-e9393e49a072>", line 5, in mongoSink
.option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/db.cl') \
File "D:\Spark2-4\python\pyspark\sql\readwriter.py", line 737, in save
self._jwrite.save()
File "D:\Spark2-4\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "D:\Spark2-4\python\pyspark\sql\utils.py", line 63, in deco
return f(*a, **kw)
File "D:\Spark2-4\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o83.save.
: java.lang.NoClassDefFoundError: com/mongodb/ConnectionString
at com.mongodb.spark.config.MongoCompanionConfig$$anonfun$4.apply(MongoCompanionConfig.scala:278)
at com.mongodb.spark.config.MongoCompanionConfig$$anonfun$4.apply(MongoCompanionConfig.scala:278)
at scala.util.Try$.apply(Try.scala:192)
at com.mongodb.spark.config.MongoCompanionConfig$class.connectionString(MongoCompanionConfig.scala:278)
at com.mongodb.spark.config.WriteConfig$.connectionString(WriteConfig.scala:37)
at com.mongodb.spark.config.WriteConfig$.apply(WriteConfig.scala:239)
at com.mongodb.spark.config.WriteConfig$.apply(WriteConfig.scala:37)
at com.mongodb.spark.config.MongoCompanionConfig$class.apply(MongoCompanionConfig.scala:124)
at com.mongodb.spark.config.WriteConfig$.apply(WriteConfig.scala:37)
at com.mongodb.spark.config.MongoCompanionConfig$class.apply(MongoCompanionConfig.scala:113)
at com.mongodb.spark.config.WriteConfig$.apply(WriteConfig.scala:37)
at com.mongodb.spark.sql.DefaultSource.createRelation(DefaultSource.scala:64)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:83)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:81)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:696)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:696)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:696)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:305)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:291)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.ClassNotFoundException: com.mongodb.ConnectionString
at java.net.URLClassLoader.findClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
... 43 more
请帮忙 谢谢,