I have a file containing records like the one below.
Input (raw record):
50.57.190.149 - - [22/Apr/2012:07:12:41 +0530] "GET /computers/laptops.html?brand=819 HTTP/1.0" 200 12530 "-" "-"
Output (processed log record):
50.57.190.149 - - 22/Apr/2012:07:12:41 +0530 GET /computers/laptops.html?brand=819 HTTP/1.0 computers laptops.html brand=819 200 12530 - -
Format of the input data:
Here is my code:
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object unStructuredToStructured {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("unStructuredToStructured").master("local[*]").getOrCreate()
    import spark.implicits._

    val rdd1 = spark.read.textFile("C:\\Users\\LENOVO\\Downloads\\Veeresh_study\\DataSet_from_OldSessions\\weblogs\\weblogs_1_rec.txt").rdd

    // one StringType column per name in the schema string
    val schemaString = "remote_IP remote_log_name user time request_string status_code byte_string user_agent referral"
    val fields = schemaString.split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)

    // naive parse: split every record on spaces and take the first nine tokens
    val rowRDD = rdd1.map(x => x.split(" "))
      .map(attributes => Row(attributes(0), attributes(1), attributes(2), attributes(3), attributes(4),
        attributes(5), attributes(6), attributes(7), attributes(8)))

    val data = spark.createDataFrame(rowRDD, schema)
    data.show()
  }
}
Output:
Here is the output I get.
As you can see from the image,
we use a space as the delimiter, so the value of a single field ends up split across multiple columns (because the field values themselves contain spaces).
For example, the "time" column should ideally hold "[22/Apr/2012:07:12:41 +0530]", but here it is split across two columns, "time" and "request_string".
Similarly, the value of request_string ("GET /computers/laptops.html?brand=819 HTTP/1.0") is spread across "status_code", "byte_string" and "user_agent".
Please help me parse the field values correctly, ignoring the spaces inside a field value.
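To make the misalignment concrete, here is a minimal sketch (only the sample record from the question is assumed) showing which token ends up at which index when the record is split on spaces:

val line = "50.57.190.149 - - [22/Apr/2012:07:12:41 +0530] \"GET /computers/laptops.html?brand=819 HTTP/1.0\" 200 12530 \"-\" \"-\""

// split(" ") yields 12 tokens: the bracketed time and the quoted request are broken apart
line.split(" ").zipWithIndex.foreach { case (token, i) => println(s"$i -> $token") }
// 0  -> 50.57.190.149            remote_IP
// 1  -> -                        remote_log_name
// 2  -> -                        user
// 3  -> [22/Apr/2012:07:12:41    time (first half)
// 4  -> +0530]                   time (second half)
// 5  -> "GET                     request_string (split into three tokens)
// 6  -> /computers/laptops.html?brand=819
// 7  -> HTTP/1.0"
// 8  -> 200                      status_code
// 9  -> 12530                    byte_string
// 10 -> "-"                      user_agent
// 11 -> "-"                      referral

Taking only the first nine tokens therefore shifts every field after "time" one or more columns to the right.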
Answer (score: 0)
After many trials I found a solution; the solution below can clearly be improved.
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object unStructuredToStructured {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("unStructuredToStructured").master("local[*]").getOrCreate()
    import spark.implicits._

    val rdd1 = spark.read.textFile("C:\\Users\\LENOVO\\Downloads\\Veeresh_study\\DataSet_from_OldSessions\\weblogs\\weblogs_10_lakh_rec.txt").rdd

    val schemaString = "remote_IP remote_log_name user time request_string status_code byte_string user_agent referral"
    val fields = schemaString.split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)

    // naive split kept from the first attempt (not used for the final DataFrame)
    val rowRDD = rdd1.map(x => x.split(" "))
      .map(attributes => Row(attributes(0), attributes(1), attributes(2), attributes(3), attributes(4),
        attributes(5), attributes(6), attributes(7), attributes(8)))
    // rowRDD.foreach(println)

    // Re-assembles the space-split tokens into the nine schema fields:
    //   tokens 3-4          -> time (the bracketed timestamp, concatenated without its space)
    //   tokens 5-7          -> request_string (method, URL, protocol)
    //   tokens 8 and 9      -> status_code and byte_string
    //   tokens 10 .. last-1 -> user_agent (may itself contain spaces)
    //   last token          -> referral
    def combiner(arr: Array[String]): Row = {
      val len = arr.length
      if (len <= 15) {
        val val0 = arr(0)
        val val1 = arr(1)
        val val2 = arr(2)
        val val3 = arr(3).concat(arr(4))
        val val4 = arr(5).concat(arr(6)).concat(arr(7))
        val val5 = arr(8)
        val val6 = arr(9)
        val last = arr.last
        var value: String = null
        for (i <- 10 until len - 1) {
          if (value == null) value = arr(i)
          else value = value.concat(arr(i))
        }
        Row(val0, val1, val2, val3, val4, val5, val6, value, last)
      } else {
        val val0 = arr(0)
        val val1 = arr(1)
        val val2 = arr(2)
        val val3 = arr(3).concat(arr(4))
        val val4 = arr(5).concat(arr(6)).concat(arr(7))
        val val5 = arr(8)
        val val6 = arr(9)
        // val7 and val8 are leftovers from experimentation and are not used in the Row returned below
        val val7 = arr(10).concat(arr(11)).concat(arr(12)).concat(arr(13))
          .concat(arr(14)).concat(arr(15)).concat(arr(16)).concat(arr(17)).concat(arr(18)).concat(arr(19))
        val val8 = arr(20)
        val last = arr.last
        var value: String = null
        for (i <- 10 until len - 1) {
          if (value == null) value = arr(i)
          else value = value.concat(arr(i))
        }
        // Row(len, val0, val1, val2, val3, val4, val5, val6, val7, val8)
        Row(val0, val1, val2, val3, val4, val5, val6, value, last)
      }
    }

    val rowRDD1 = rdd1.map(x => x.split(" "))
      .map(attributes => combiner(attributes))
    rowRDD1.foreach(println)

    val data = spark.createDataFrame(rowRDD1, schema)
    data.show()
  }
}
Here is a screenshot of the output.
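Since the solution above can clearly be improved, one possible refinement (a sketch only, not part of the original post) is to parse each record with a regular expression instead of splitting on spaces, so that bracketed and quoted fields keep their internal spaces. It reuses rdd1, schema and spark from the code above; the names logPattern, rowRDD2 and data2 are illustrative, and treating the first trailing quoted field as the user agent and the last as the referral is an assumption that follows the schema used in the post:

// one capture group per schema column; bracketed/quoted fields may contain spaces
val logPattern = """^(\S+) (\S+) (\S+) \[([^\]]+)\] "([^"]*)" (\S+) (\S+) "([^"]*)" "([^"]*)"$""".r

val rowRDD2 = rdd1.flatMap {
  case logPattern(ip, logName, user, time, request, status, bytes, agent, referral) =>
    Some(Row(ip, logName, user, time, request, status, bytes, agent, referral))
  case _ => None // drop records that do not match the expected layout
}
val data2 = spark.createDataFrame(rowRDD2, schema)
data2.show(truncate = false)

Records that do not match the pattern are silently dropped here; in practice they could be routed to a separate output for inspection.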