awk根据单个列的唯一值组合其他列的唯一值

时间:2020-03-29 10:05:27

标签: csv awk

我的输入文件看起来像

Item1,200,a,four,five,six,seven,eight1,nine1
Item2,500,b,four,five,six,seven,eight2,nine2
Item3,900,c,four,five,six,seven,eight3,nine3
Item2,800,d,four,five,six,seven,eight4,nine4
Item1,,e,four,five,six,seven,eight5,nine5

基于第一列的唯一值,我想结合所有其他列的唯一值。 到目前为止,我尝试过的是:

# Asker's original attempt, kept exactly as posted (it produces the undesired
# output shown right after it).
# For every input line, fields 2..9 are appended to eight per-column arrays
# (a..h) keyed by field 1, joined with "_" whenever the stored value is truthy.
# Nothing checks for duplicates or for empty fields, so repeated values and
# empty fields are appended as-is.
# The END block prints one line per key; "for (i in a)" imposes no particular
# key order. OFS=, is passed as a command-line assignment before input.txt.
awk -F, '{
a[$1]=a[$1]?a[$1]"_"$2:$2;
b[$1]=b[$1]?b[$1]"_"$3:$3;
c[$1]=c[$1]?c[$1]"_"$4:$4;
d[$1]=d[$1]?d[$1]"_"$5:$5;
e[$1]=e[$1]?e[$1]"_"$6:$6;
f[$1]=f[$1]?f[$1]"_"$7:$7;
g[$1]=g[$1]?g[$1]"_"$8:$8;
h[$1]=h[$1]?h[$1]"_"$9:$9;
}END{for (i in a)print i, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i];}' OFS=, input.txt

上面的输出是:

Item3,900,c,four,five,six,seven,eight3,nine3
Item1,200_,a_e,four_four,five_five,six_six,seven_seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four_four,five_five,six_six,seven_seven,eight2_eight4,nine2_nine4

但是我期望的是:

Item3,900,c,four,five,six,seven,eight3,nine3
Item1,200,a_e,four,five,six,seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four,five,six,seven,eight2_eight4,nine2_nine4

我正在寻求一些帮助:

  1. 如何在组合值时仅采用唯一值?
  2. 当某个字段为空时,如何在合并时避免在末尾多加一个分隔符(在我的情况下为下划线)?
  3. 如何根据列1的值对输出进行排序?

非常感谢您的帮助。

2 个答案:

答案 0 :(得分:2)

使用任意 awk 再加上 sort:

$ cat tst.awk
# Merge CSV rows that share the same first field. For each remaining column,
# the distinct non-blank values are joined with "_" in first-seen order.
# Output key order is unspecified; pipe through sort if ordering matters.
BEGIN { FS = ","; OFS = "," }

{
    rowKey = $1
    keys[rowKey]                        # register the key even if all other fields are blank

    for (col = 2; col <= NF; col++) {
        # Ignore fields that contain nothing but whitespace (or nothing at all).
        if ($col !~ /[^[:space:]]/)
            continue
        # Ignore values already recorded for this key/column pair.
        if (seen[rowKey, col, $col]++)
            continue

        slot = rowKey FS col
        if (slot in vals)
            vals[slot] = vals[slot] "_" $col
        else
            vals[slot] = $col
    }
}

END {
    # NF here still holds the field count of the last record read.
    for (rowKey in keys) {
        line = rowKey OFS
        for (col = 2; col <= NF; col++)
            line = line vals[rowKey FS col] (col < NF ? OFS : ORS)
        printf "%s", line
    }
}

$ awk -f tst.awk file | sort -t, -k1,1
Item1,200,a_e,four,five,six,seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four,five,six,seven,eight2_eight4,nine2_nine4
Item3,900,c,four,five,six,seven,eight3,nine3

或者使用 GNU awk 的“数组的数组”(两者的区别请参见 https://www.gnu.org/software/gawk/manual/gawk.html#Multidimensional 与 https://www.gnu.org/software/gawk/manual/gawk.html#Arrays-of-Arrays)以及 sorted_in(请参见 https://www.gnu.org/software/gawk/manual/gawk.html#Controlling-Array-Traversal 和 https://www.gnu.org/software/gawk/manual/gawk.html#Controlling-Scanning):

$ cat tst.awk
# GNU awk only (arrays of arrays + PROCINFO["sorted_in"]).
# Merge CSV rows that share the same first field. For each remaining column,
# the distinct non-blank values are joined with "_". Keys and the values
# inside each column are both emitted in ascending string order.
BEGIN { FS = ","; OFS = "," }

{
    # vals[key][column][value] is used as a set: merely referencing the
    # element creates it, so duplicates collapse automatically.
    for (col = 2; col <= NF; col++)
        vals[$1][col][$col]
}

END {
    PROCINFO["sorted_in"] = "@ind_str_asc"

    # NF here still holds the field count of the last record read.
    for (rowKey in vals) {
        line = rowKey OFS
        for (col = 2; col <= NF; col++) {
            joined = ""
            have = 0
            for (v in vals[rowKey][col]) {
                # Skip blank / whitespace-only values.
                if (v !~ /[^[:space:]]/)
                    continue
                joined = joined (have ? "_" : "") v
                have = 1
            }
            line = line joined (col < NF ? OFS : ORS)
        }
        printf "%s", line
    }
}

$ awk -f tst.awk file
Item1,200,a_e,four,five,six,seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four,five,six,seven,eight2_eight4,nine2_nine4
Item3,900,c,four,five,six,seven,eight3,nine3

答案 1 :(得分:1)

编辑:添加使用更合理变量名的解决方案。

# Merge rows of Input_file (comma-separated) that share the same first column.
# For every other column, the distinct non-empty values are joined with "_"
# in first-seen order.
# Output: one line per distinct first-column value. Key order is whatever the
# awk "for (ind in ...)" traversal yields; pipe through sort if order matters.
awk '
BEGIN{
  FS=OFS=","
}
{
  first_field_value[$1]                        # remember every distinct key
  for(i=2;i<=NF;i++){
    # Keep a value only if it is non-empty and has not been seen yet for this
    # key/column pair. The seen[] lookup avoids re-splitting the joined
    # string, which would misfire on values that themselves contain "_".
    if($i!="" && !seen[$1,i,$i]++){
      field_values[$1,i]=(field_values[$1,i]=="" ? "" : field_values[$1,i] "_") $i
    }
  }
  tot_field=tot_field>NF?tot_field:NF          # widest record seen so far
}
END{
  for(ind in first_field_value){
    printf "%s",ind
    for(j=2;j<=tot_field;j++){
      printf "%s%s",OFS,field_values[ind,j]
    }
    printf "%s",ORS                            # always end the line, even for single-column input
  }
}
'  Input_file

输出如下。

Item3,900,c,four,five,six,seven,eight3,nine3
Item1,200,a_e,four,five,six,seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four,five,six,seven,eight2_eight4,nine2_nine4

说明:以下是对我之前那版代码的解释;其中的变量名不够直观,但仍可借助这些注释来理解代码。

# Annotated earlier version of the same solution: merges rows of Input_file
# that share the same first column, joining the distinct non-empty values of
# every other column with "_".
awk '                                          ##Starting awk program from here.
BEGIN{                                         ##Starting BEGIN section.
  FS=OFS=","                                   ##Setting FS and OFS as comma here.
}
{
  b[$1]                                        ##Array b is indexed by $1; it keeps track of every distinct first-field value.
  for(i=2;i<=NF;i++){                          ##Running for loop from i=2 up to the value of NF.
    if($i!=""){                                ##Processing the field only when it is NOT empty.
      delete e                                 ##Clearing array e first, so values seen for another key or column are not mistaken for duplicates.
      num=split(c[$1,i],d,OFS)                 ##Splitting c[$1,i] on OFS (the same separator used to build it below) into array d; num is the number of elements.
      for(p=1;p<=num;p++){                     ##Running a for loop from p=1 to the value of num.
        e[d[p]]                                ##Array e is indexed by the values already collected for this key and column.
      }
      if(!($i in e)){                          ##If the current field is not present in array e then do following.
        a[$1,i]=(a[$1,i]?a[$1,i] "_":"")$i     ##Appending the field to a[$1,i], with "_" in between once a value is already stored.
      }
      c[$1,i]=(c[$1,i]?c[$1,i] OFS:"")$i       ##Appending the field to c[$1,i] joined by OFS; array c is what lets the check above reject duplicate values for array a.
    }
  }
  tot_field=tot_field>NF?tot_field:NF          ##Variable tot_field keeps the largest NF seen; the END block loops up to it.
}
END{
  for(k in b){                                     ##Starting a for loop which traverses array b here.
    printf "%s,",k;                                ##Printing its index, which is the first field of the input lines.
    for(j=2;j<=tot_field;j++){                     ##Running for loop up to the maximum number of fields.
      printf("%s%s",a[k,j],j==tot_field?ORS:OFS)   ##Printing a[k,j], where k is an index of array b (1st field) and j is the field number starting from 2.
    }
  }
}
'  Input_file                                      ##Mentioning Input_file name here.
相关问题