我的输入文件看起来像
Item1,200,a,four,five,six,seven,eight1,nine1
Item2,500,b,four,five,six,seven,eight2,nine2
Item3,900,c,four,five,six,seven,eight3,nine3
Item2,800,d,four,five,six,seven,eight4,nine4
Item1,,e,four,five,six,seven,eight5,nine5
我想按第一列的唯一值对行进行分组,并把其余每一列中不重复的值用"_"合并起来。到目前为止,我尝试过的是:
awk -F, '
# For every record, append the values of columns 2-9 to a per-key,
# per-column string, using "_" as the joiner. No de-duplication is done
# and empty values are appended as well.
{
    keys[$1]
    for (col = 2; col <= 9; col++) {
        slot = $1 SUBSEP col
        if (joined[slot])
            joined[slot] = joined[slot] "_" $col
        else
            joined[slot] = $col
    }
}
# Print one line per distinct first-column value, in whatever order awk
# traverses the array.
END {
    for (k in keys) {
        line = k
        for (col = 2; col <= 9; col++)
            line = line OFS joined[k SUBSEP col]
        print line
    }
}' OFS=, input.txt
上面的输出是:
Item3,900,c,four,five,six,seven,eight3,nine3
Item1,200_,a_e,four_four,five_five,six_six,seven_seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four_four,five_five,six_six,seven_seven,eight2_eight4,nine2_nine4
但是我期望的是:
Item3,900,c,four,five,six,seven,eight3,nine3
Item1,200,a_e,four,five,six,seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four,five,six,seven,eight2_eight4,nine2_nine4
我正在寻求一些帮助:
非常感谢您的帮助。
答案 0 :(得分:2)
使用任意 awk 加上 sort:
$ cat tst.awk
# Group comma-separated rows by their first field. For every other column,
# join the distinct non-blank values with "_" in order of first appearance.
# Output order of the keys is unspecified (pipe through sort if needed).
BEGIN { FS=OFS="," }

# Empty records carry no key and no values; skip them so they do not
# produce an output line with an empty key.
NF == 0 { next }

{
    key = $1
    keys[key]

    # Remember the widest record. NF inside END only reflects the LAST
    # record, so it cannot be used to decide how many columns to print.
    if ( NF > maxNF ) {
        maxNF = NF
    }

    for (i=2; i<=NF; i++) {
        # Keep a value only if it has a non-space character and has not
        # been seen before for this key and column.
        if ( ($i ~ /[^[:space:]]/) && (!seen[key,i,$i]++) ) {
            idx = key FS i
            vals[idx] = (idx in vals ? vals[idx] "_" : "") $i
        }
    }
}

END {
    for (key in keys) {
        printf "%s", key
        for (i=2; i<=maxNF; i++) {
            printf "%s%s", OFS, vals[key FS i]
        }
        # Always terminate the line, even when no record had a second field.
        printf "%s", ORS
    }
}
。
$ awk -f tst.awk file | sort -t, -k1,1
Item1,200,a_e,four,five,six,seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four,five,six,seven,eight2_eight4,nine2_nine4
Item3,900,c,four,five,six,seven,eight3,nine3
或者使用 GNU awk 的"数组的数组"(两者的区别请参见 https://www.gnu.org/software/gawk/manual/gawk.html#Multidimensional 和 https://www.gnu.org/software/gawk/manual/gawk.html#Arrays-of-Arrays )以及 sorted_in(请参见 https://www.gnu.org/software/gawk/manual/gawk.html#Controlling-Array-Traversal 和 https://www.gnu.org/software/gawk/manual/gawk.html#Controlling-Scanning ):
$ cat tst.awk
# GNU awk only (uses arrays of arrays and PROCINFO["sorted_in"]).
# Group comma-separated rows by their first field. For every other column,
# join the distinct non-blank values with "_".
# Because sorted_in is set to "@ind_str_asc", both the keys and the values
# inside each column are emitted in ascending string order.
BEGIN { FS=OFS="," }
{
    # Remember the widest record. NF inside END only reflects the LAST
    # record, so it cannot be used to decide how many columns to print.
    if ( NF > maxNF ) {
        maxNF = NF
    }
    for ( i=2; i<=NF; i++ ) {
        vals[$1][i][$i]
    }
}
END {
    PROCINFO["sorted_in"] = "@ind_str_asc"
    for ( key in vals ) {
        printf "%s", key
        for ( i=2; i<=maxNF; i++ ) {
            printf "%s", OFS
            # A key whose rows were shorter than maxNF has no sub-array
            # for this column; print it as an empty field.
            if ( i in vals[key] ) {
                sep = ""
                for ( val in vals[key][i] ) {
                    if ( val ~ /[^[:space:]]/ ) {
                        printf "%s%s", sep, val
                        sep = "_"
                    }
                }
            }
        }
        printf "%s", ORS
    }
}
。
$ awk -f tst.awk file
Item1,200,a_e,four,five,six,seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four,five,six,seven,eight2_eight4,nine2_nine4
Item3,900,c,four,five,six,seven,eight3,nine3
答案 1 :(得分:1)
编辑:添加了一个变量名更合理的解决方案。
awk '
BEGIN{
FS=OFS=","
}
{
first_field_value[$1]
for(i=2;i<=NF;i++){
if($i!=""){
split(field_values[$1,i],temp_array,"_")
delete column_value
for(p in temp_array){
column_value[temp_array[p]]
}
if(!($i in column_value)){
(field_values[$1,i] == "" ? "" : field_values[$1,i] "_")$i
}
}
}
tot_field=tot_field>NF?tot_field:NF
}
END{
for(ind in first_field_value){
printf "%s,",ind;
for(j=2;j<=tot_field;j++){
printf("%s%s",field_values[ind,j],j==tot_field?ORS:OFS)
}
}
}
' Input_file
输出如下。
Item3,900,c,four,five,six,seven,eight3,nine3
Item1,200,a_e,four,five,six,seven,eight1_eight5,nine1_nine5
Item2,500_800,b_d,four,five,six,seven,eight2_eight4,nine2_nine4
说明:以下是对我之前那版代码的逐行解释;其中的变量名不够直观,但仍可以借助这些注释来理解代码。
awk '                                         ##Starting awk program from here.
BEGIN{                                        ##Starting BEGIN section.
  FS=OFS=","                                  ##Setting FS and OFS as comma here.
}
{
  b[$1]                                       ##Creating array b which has index $1, basically to keep track of $1 values as index here.
  for(i=2;i<=NF;i++){                         ##Running for loop from i=2 to till value of NF here.
    if($i!=""){                               ##Checking if any field is NOT NULL then do following.
      delete e                                ##Emptying array e, so that values seen for another key or another column can NOT hide the current value.
      num=split(c[$1,i],d,OFS)                ##Splitting value of array c (index $1,i) on OFS, the same separator it is joined with below, into array d; num is number of elements in d.
      for(p=1;p<=num;p++){                    ##Running a for loop from p=1 to value of num.
        e[d[p]]                               ##Creating array e whose indexes are the values already seen for this key and column.
      }
      if(!($i in e)){                         ##If current field is not present in array e then do following.
        a[$1,i]=(a[$1,i]?a[$1,i] "_":"")$i    ##Creating array a with index of $1,i and keep concatenating its value to it.
      }
      c[$1,i]=(c[$1,i]?c[$1,i] OFS:"")$i      ##Appending current field to array c with OFS; array c is the record of values already seen, which STOPS duplicate values from entering array a.
    }
  }
  tot_field=tot_field>NF?tot_field:NF         ##Creating variable tot_field which will let us know till what value we need to run loop in END BLOCK of this code.
}
END{
  for(k in b){                                ##Starting a for loop which traverse through array b here.
    printf "%s,",k;                           ##Printing its index here which is basically first field of all lines.
    for(j=2;j<=tot_field;j++){                ##Running for loop till value of Maximum field value.
      printf("%s%s",a[k,j],j==tot_field?ORS:OFS)  ##Printing value of array a whose index is k and j where k is index of array b(1st field) and j is field number starts from 2.
    }
  }
}
' Input_file                                  ##Mentioning Input_file name here.