Assign Country to IP addresses

时间:2018-03-25 20:28:08

标签: scala apache-spark apache-spark-sql

I am trying to join two tables so that each IP address is matched to its country code; screenshots of both tables are attached below. How can I join these two tables? I am using Zeppelin.

enter image description here

enter image description here

/**
 * Converts a dotted address string into a single numeric value.
 *
 * The input is split on '.'; the part at position i is parsed with `toInt`,
 * reduced modulo 256 and weighted by 256^(3 - i), so the first part is the
 * most significant one. The weighted parts are summed.
 *
 * @param dottedIP address text such as "10.1.2.3"
 * @return the sum of the weighted parts
 */
def ipToLong(dottedIP: String): Long = {
  dottedIP
    .split("\\.")
    .zipWithIndex
    .foldLeft(0L) { case (total, (octet, position)) =>
      // Position 0 gets weight 256^3, position 1 gets 256^2, and so on.
      val weight = Math.pow(256, 3 - position)
      total + ((octet.toInt % 256) * weight).toLong
    }
}

// Read the audit CSV as an RDD of text lines.
val rdd1 = sc.textFile("/user/mamta/mamta_audit/mamta_audit.csv")

/** One parsed line of the audit CSV; every column is kept as the raw string. */
case class IPCode(Date_key: String, LogID: String, Activity: String, SourceIP: String)

// split(",", -1) keeps trailing empty columns. The one-argument split drops
// them, so a line whose last column is empty would yield a shorter array and
// fail on the index lookup below.
val sal1 = rdd1.map { line =>
  val cols = line.split(",", -1)
  IPCode(cols(0), cols(1), cols(2), cols(3))
}.toDF("Date_key", "LogID", "Activity", "SourceIP")



// Read the IP-range-to-location CSV as an RDD of text lines.
val rdd2 = sc.textFile("/user/mamta/IP_LocationCode.csv")

/** One parsed line of the location CSV; every column is kept as the raw string. */
case class IPLoc(
  ip_from: String,
  ip_to: String,
  Country_Code: String,
  Region_Name: String,
  City_Name: String
)

// split(",", -1) keeps trailing empty columns. The one-argument split drops
// them, so a line with an empty final column would yield a shorter array and
// fail on the index lookup below.
val sal2 = rdd2.map { line =>
  val cols = line.split(",", -1)
  IPLoc(cols(0), cols(1), cols(2), cols(3), cols(4))
}.toDF("ip_from", "ip_to", "Country_Code", "Region_Name", "City_Name")

I tried the following join, but it gives me null values for ip_from, ip_to and Country_Code:

// Left join: every sal1 row is kept, and a sal2 row is attached when SourceIP
// lies in the inclusive range [ip_from, ip_to].
// NOTE(review): if these columns hold dotted-IP strings, the comparison is
// lexicographic rather than numeric — confirm the column types.
val sourceIp = sal1("SourceIP")
val withinRange = sourceIp.between(sal2("ip_from"), sal2("ip_to"))
sal1.join(sal2, withinRange, "left").show()

1 个答案:

答案 0 :(得分:1)

您的ipToLong方法需要转换为UDF,才能应用于连接条件中的IP列,如下所示:

// Sample source addresses; "invalid.ip" is deliberately not a dotted-quad address.
val sal1 = Seq("109.175.191.0", "invalid.ip", "187.42.62.209", "89.142.219.5").toDF("SourceIP")

// Sample (ip_from, ip_to, country) ranges; the last row has non-numeric bounds.
val sampleRanges = Seq(
  ("75.0.0.0", "89.255.255.255", "Country A"),
  ("90.0.0.0", "129.255.255.255", "Country B"),
  ("130.0.0.0", "199.255.255.255", "Country C"),
  ("bad.ip", "bad.ip", "Country Z")
)
val sal2 = sampleRanges.toDF("ip_from", "ip_to", "country")

import org.apache.spark.sql.functions._

/**
 * Builds a Spark UDF that converts a dotted-quad IPv4 string to its numeric value.
 *
 * The UDF returns a.b.c.d as ((a * 256 + b) * 256 + c) * 256 + d when the input
 * consists of four decimal octets in 0..255, optionally surrounded by whitespace.
 * Any other input yields -1, so callers can filter invalid addresses with `>= 0`.
 */
def ipToLongUDF = {
  // Compiled once per UDF instead of once per row. The capture groups hand back
  // the bare octets, so surrounding whitespace admitted by \s* never reaches toInt.
  val patternIPv4 = """\s*(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\s*""".r
  udf { (ip: String) =>
    ip match {
      case patternIPv4(a, b, c, d) =>
        val octets = Seq(a, b, c, d).map(_.toInt)
        // \d{1,3} also admits 256..999, which are not valid octets.
        if (octets.forall(_ <= 255)) octets.foldLeft(0L)((acc, octet) => (acc << 8) | octet)
        else -1L
      case _ => -1L
    }
  }
}

// Name the converted columns so the join condition reads as a plain range check.
val sourceAsLong = ipToLongUDF(sal1("SourceIP"))
val rangeStart = ipToLongUDF(sal2("ip_from"))
val rangeEnd = ipToLongUDF(sal2("ip_to"))

// The `>= 0` guard stops a SourceIP that converted to a negative value from
// matching any range; the left join still keeps that row, with nulls on the
// sal2 side.
val inRange = sourceAsLong >= 0 && sourceAsLong >= rangeStart && sourceAsLong <= rangeEnd

sal1.join(sal2, inRange, "left").show()
// +-------------+---------+---------------+---------+
// |     SourceIP|  ip_from|          ip_to|  country|
// +-------------+---------+---------------+---------+
// |109.175.191.0| 90.0.0.0|129.255.255.255|Country B|
// |   invalid.ip|     null|           null|     null|
// |187.42.62.209|130.0.0.0|199.255.255.255|Country C|
// | 89.142.219.5| 75.0.0.0| 89.255.255.255|Country A|
// +-------------+---------+---------------+---------+