Solving a Spark resource problem

Date: 2017-05-29 09:54:01

Tags: apache-spark spark-dataframe

I am having a problem with my Spark setup. The exception I get is the following:

kernel.js:978 ui-logs-160> [Mon May 29 2017 12:44:45 GMT+0300 (FLE Daylight Time)] [org.apache.spark.scheduler.TaskSchedulerImpl] Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources

After searching on Google, I found that this happens when the number of cores or the amount of memory requested from the cluster is too high, and that I have to change it.
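If I understand correctly, the settings to change are spark.cores.max and spark.executor.memory. A minimal sketch (with illustrative values, not necessarily the right ones for my cluster) of lowering them when building the session:

import org.apache.spark.sql.SparkSession

// Illustrative values: request fewer total cores and less executor memory
// than the cluster advertises.
val spark = SparkSession.builder
  .master("spark://localhost:7077")
  .config("spark.cores.max", "4")          // cap on total cores for this app
  .config("spark.executor.memory", "512m") // memory per executor
  .getOrCreate()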

I deployed a mini Spark cluster that looks like this: [cluster screenshot]

I created it with docker-compose, using the following config file:

version: '3'
services:
  master:
    image: gettyimages/spark
    command: bin/spark-class org.apache.spark.deploy.master.Master -h master
    hostname: master
    environment:
      MASTER: spark://master:7077
      SPARK_CONF_DIR: /conf
      SPARK_PUBLIC_DNS: localhost
    expose:
      - 7001
      - 7002
      - 7003
      - 7004
      - 7005
      - 7006
      - 7077
      - 6066
    ports:
      - 4040:4040
      - 6066:6066
      - 7077:7077
      - 8080:8080
    volumes:
      - ./conf/master:/conf
      - ./data/master:/tmp/data

  worker1:
    image: gettyimages/spark
    depends_on:
      - master
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
    hostname: worker
    environment:
      SPARK_CONF_DIR: /conf
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 1g
      SPARK_WORKER_PORT: 8881
      SPARK_WORKER_WEBUI_PORT: 8081
      SPARK_PUBLIC_DNS: localhost
    links:
      - master
    expose:
      - 7012
      - 7013
      - 7014
      - 7015
      - 7016
      - 8881
      - 8081
    ports:
      - 8081:8081
    volumes:
      - ./conf/worker1:/conf
      - ./data/worker1:/tmp/data

  worker2:
    image: gettyimages/spark
    depends_on:
      - master
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
    hostname: worker
    environment:
      SPARK_CONF_DIR: /conf
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 1g
      SPARK_WORKER_PORT: 8882
      SPARK_WORKER_WEBUI_PORT: 8082
      SPARK_PUBLIC_DNS: localhost
    links:
      - master
    expose:
      - 7012
      - 7013
      - 7014
      - 7015
      - 7016
      - 8882
      - 8082
    ports:
      - 8082:8082
    volumes:
      - ./conf/worker2:/conf
      - ./data/worker2:/tmp/data

  worker3:
    image: gettyimages/spark
    depends_on:
      - master
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
    hostname: worker
    environment:
      SPARK_CONF_DIR: /conf
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 1g
      SPARK_WORKER_PORT: 8883
      SPARK_WORKER_WEBUI_PORT: 8083
      SPARK_PUBLIC_DNS: localhost
    links:
      - master
    expose:
      - 7012
      - 7013
      - 7014
      - 7015
      - 7016
      - 8883
      - 8083
    ports:
      - 8083:8083
    volumes:
      - ./conf/worker3:/conf
      - ./data/worker3:/tmp/data

  worker4:
    image: gettyimages/spark
    depends_on:
      - master
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
    hostname: worker
    environment:
      SPARK_CONF_DIR: /conf
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 1g
      SPARK_WORKER_PORT: 8884
      SPARK_WORKER_WEBUI_PORT: 8084
      SPARK_PUBLIC_DNS: localhost
    links:
      - master
    expose:
      - 7012
      - 7013
      - 7014
      - 7015
      - 7016
      - 8884
      - 8084
    ports:
      - 8084:8084
    volumes:
      - ./conf/worker4:/conf
      - ./data/worker4:/tmp/data

  worker5:
    image: gettyimages/spark
    depends_on:
      - master
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
    hostname: worker
    environment:
      SPARK_CONF_DIR: /conf
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 1g
      SPARK_WORKER_PORT: 8885
      SPARK_WORKER_WEBUI_PORT: 8085
      SPARK_PUBLIC_DNS: localhost
    links:
      - master
    expose:
      - 7012
      - 7013
      - 7014
      - 7015
      - 7016
      - 8885
      - 8085
    ports:
      - 8085:8085
    volumes:
      - ./conf/worker5:/conf
      - ./data/worker5:/tmp/data
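
As a sanity check, the master's status JSON can confirm that all five workers registered (this assumes the standalone master serves its status as JSON at /json on the web UI port, 8080 as mapped above):

import scala.io.Source

// Fetch the standalone master's status JSON (assumes http://localhost:8080/json
// is reachable from where this runs). It lists each registered worker together
// with its cores and memory.
val status = Source.fromURL("http://localhost:8080/json/").mkString
println(status) // expect five workers, each with 2 cores and 1024 MB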

I tried to query the data using Spark Notebook.

My Spark conf is the following:

{
  "spark.app.name": "Dockerbook",
  "spark.master": "spark://localhost:7077",
  "spark.executor.memory": "512m",
  "spark.cores.max": "5"
}
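
To confirm the notebook actually applies this conf, the values can be read back from the running session (the session is created in the code further below; this snippet is purely illustrative):

// Sanity check: print the settings the session actually uses.
println(spark.conf.get("spark.cores.max"))       // expect "5"
println(spark.conf.get("spark.executor.memory")) // expect "512m"
println(spark.sparkContext.master)               // expect "spark://localhost:7077"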

That is half of what the cluster offers (the five workers together advertise 10 cores and 5 GB of memory, while I request at most 5 cores and 512 MB per executor), yet my code still will not run.

The code I am running is very simple:

import org.apache.spark.sql.SparkSession

val spark = SparkSession
  .builder
  .getOrCreate

import spark.implicits._

// Read one table over JDBC and keep only the two columns I need.
val country = spark.read
  .format("jdbc")
  .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
  .option("url", "jdbc:sqlserver://mssql-host;database=database;")
  .option("dbtable", "dbo.Country")
  .option("user", "bestuserever")
  .option("password", "mostsecurepassword")
  .load()
  .select($"CountryID", $"CountryName")
  .cache()
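
Note that cache() is lazy, so nothing above schedules a job by itself; the warning only appears once an action runs. For illustration, the simplest action that triggers it:

// Illustrative action (not part of my original snippet): the first action
// schedules the job, and that is when the warning above shows up.
country.count()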

But I always end up with the same warning. How can I fix this?

0 Answers:

There are no answers yet.