查找并打印列值重复n次的行

时间:2018-09-23 04:25:20

标签: bash shell awk

我有一个文件:

import Foundation
import Alamofire

/// Serial download queue for MP3 files.
///
/// Entries are appended to `listData` and processed one at a time from the
/// front of the array. Files are saved to the destination returned by
/// Alamofire's `suggestedDownloadDestination` for the user's documents
/// directory.
class AudioSyncManager {

    //var onDownloadStart: (()->())?

    /// Invoked once for every queue entry that was attempted (or could not be
    /// attempted because no valid URL could be built for it).
    /// `isSuccess` is `false` when the download response carries an error or
    /// when the entry had no usable URL.
    var onDownloadFinished: ((_ isSuccess: Bool)->())?

    /// Invoked with the completed fraction of the running download,
    /// converted from `Progress.fractionCompleted`.
    var onDownloadProgress: ((_ progress: Float)->())?

    static let shared = AudioSyncManager()

    /// The most recently started download request.
    private var downloadRequest: DownloadRequest?

    /// `true` while a request is in flight; keeps the queue strictly serial.
    private var isDownloading = false

    /// Pending entries; the first element is the one being (or about to be) downloaded.
    var listData: [MainModel] = []

    /// Starts downloading the first pending entry unless a download is already running.
    ///
    /// Entries for which `checkMp3FileExists(model:)` returns `true` are dropped
    /// without downloading (presumably the MP3 is already on disk - confirm
    /// against that helper). Entries for which no URL can be built are dropped
    /// and reported through `onDownloadFinished(false)` instead of crashing.
    ///
    /// - Parameter onStarted: Called right before the download request is
    ///   created. It is not called when the queue is empty, when a download is
    ///   already in flight, or when every pending entry was skipped.
    func doDownloding(onStarted: @escaping ()->()) {

        // The `!isDownloading` check is re-evaluated on every pass so that a
        // re-entrant call made from one of the callbacks cannot start a second
        // request in parallel.
        while !isDownloading, let head = listData.first {

            if checkMp3FileExists(model: head) {
                listData.removeFirst()
                continue
            }

            // Optional chaining keeps this binding valid whether `link` is
            // declared optional or not.
            guard let link = listData.first?.link,
                let url = URL(string: MyHelper.MEDIA_URL_PREFIX + link) else {
                listData.removeFirst()
                onDownloadFinished?(false)
                continue
            }

            let destination = DownloadRequest.suggestedDownloadDestination(for: .documentDirectory, in: .userDomainMask)

            // Mark the queue busy BEFORE notifying, so callbacks that enqueue
            // more work only append to `listData`.
            isDownloading = true
            onStarted()

            downloadRequest = Alamofire.download(url, method: .get, parameters: nil, encoding: JSONEncoding.default, headers: nil, to: destination)
                .downloadProgress { (progress) in

                    self.onDownloadProgress?(Float(progress.fractionCompleted))

                }.response { (response) in

                    // Report the real outcome rather than an unconditional success.
                    self.onDownloadFinished?(response.error == nil)

                    // `listData` is publicly mutable, so it may have been
                    // emptied while the request was running.
                    if !self.listData.isEmpty {
                        self.listData.removeFirst()
                    }

                    // Release the queue only after the finished entry is gone,
                    // then move on to the next pending entry (if any).
                    self.isDownloading = false
                    self.doDownloding {}

                }

            return
        }

    }

    /// Appends a single entry to the queue and starts processing if idle.
    func addSingleTask(mainModel: MainModel) {

        listData.append(mainModel)
        doDownloding {}

    }

    /// Appends several entries to the queue and starts processing if idle.
    func addListTask(newList: [MainModel]) {

        listData.append(contentsOf: newList)
        doDownloding {}

    }

}

我想打印第三列中的值重复出现 3 次或更多次的行,以便把这些行删除:

scaffold_0      11498
scaffold_0      11501
scaffold_0      11728   "RHOH"
scaffold_0      12144   "RHOH"
scaffold_0      20708   "RHOH"
scaffold_0      23579   "RHOH"
scaffold_0      130818
scaffold_0      200485  "NSUN7"
scaffold_0      209928  "NSUN7"
scaffold_0      212965  "NSUN7"
scaffold_0      214055  "APBB2"
scaffold_0      223404
scaffold_0      223686  "APBB2"
scaffold_0      227687  "APBB2"
scaffold_0      306105  "APBB2"
scaffold_0      307000  "APBB2"
scaffold_0      391742
scaffold_0      399332  "UCHL1"
scaffold_0      406726  "UCHL1"
scaffold_0      482215
scaffold_0      484921
scaffold_0      538855  "LIMCH1"
scaffold_0      539051  "LIMCH1"
scaffold_0      539819
scaffold_0      543347  "LIMCH1"
scaffold_0      568182  "LIMCH1"
scaffold_0      570321
scaffold_0      570325
scaffold_0      577502  "LIMCH1"
scaffold_0      578933  "LIMCH1"
scaffold_0      621330  "PHOX2B"
scaffold_0      623303  "PHOX2B"
scaffold_0      640271
scaffold_0      667510  "gene3"
scaffold_0      679096
scaffold_0      698659  "TMEM33"
scaffold_0      700427  "TMEM33"

我希望保留文件原有的顺序,并保留第三列为空的行。我尝试过:

scaffold_0      399332  "UCHL1"
scaffold_0      406726  "UCHL1"
scaffold_0      621330  "PHOX2B"
scaffold_0      623303  "PHOX2B"
scaffold_0      667510  "gene3"
scaffold_0      698659  "TMEM33"
scaffold_0      700427  "TMEM33"

2 个答案:

答案 0 :(得分:1)

这个 awk 脚本会把整个文件读入内存,并保存在以行号为键的数组中:

$ awk '{
    line[NR] = $0         # keep every record, keyed by line number, so input order survives
    key[NR]  = $3         # remember this record's 3rd column (empty string if absent)
    seen[$3]++            # tally how many records share that 3rd-column value
}
END {                     # runs once the whole file has been stored
    for (n = 1; n <= NR; n++)        # walk the records in their original order
        if (seen[key[n]] >= 3)       # value occurred at least 3 times
            print line[n]
}' file

输出样本:

...
scaffold_0      306105  "APBB2"
scaffold_0      307000  "APBB2"
scaffold_0      391742
scaffold_0      482215
scaffold_0      484921
scaffold_0      538855  "LIMCH1"
scaffold_0      539051  "LIMCH1"
...

答案 1 :(得分:1)

$ awk '
    FNR == NR { count[$3]++; next }   # 1st read of file: tally every 3rd-column value (empty ones included)
    count[$3] >= 3                    # 2nd read: print rows whose value was seen 3 or more times
' file file
scaffold_0      11498
scaffold_0      11501
scaffold_0      11728   "RHOH"
scaffold_0      12144   "RHOH"
scaffold_0      20708   "RHOH"
scaffold_0      23579   "RHOH"
scaffold_0      130818
scaffold_0      200485  "NSUN7"
scaffold_0      209928  "NSUN7"
scaffold_0      212965  "NSUN7"
scaffold_0      214055  "APBB2"
scaffold_0      223404
scaffold_0      223686  "APBB2"
scaffold_0      227687  "APBB2"
scaffold_0      306105  "APBB2"
scaffold_0      307000  "APBB2"
scaffold_0      391742
scaffold_0      482215
scaffold_0      484921
scaffold_0      538855  "LIMCH1"
scaffold_0      539051  "LIMCH1"
scaffold_0      539819
scaffold_0      543347  "LIMCH1"
scaffold_0      568182  "LIMCH1"
scaffold_0      570321
scaffold_0      570325
scaffold_0      577502  "LIMCH1"
scaffold_0      578933  "LIMCH1"
scaffold_0      640271
scaffold_0      679096

$ awk '
    FNR == NR { if ($3 != "") count[$3]++; next }   # 1st read of file: tally only non-empty 3rd-column values
    count[$3] >= 3                                  # 2nd read: print rows seen 3+ times; empty 3rd columns have no tally, so they are dropped
' file file
scaffold_0      11728   "RHOH"
scaffold_0      12144   "RHOH"
scaffold_0      20708   "RHOH"
scaffold_0      23579   "RHOH"
scaffold_0      200485  "NSUN7"
scaffold_0      209928  "NSUN7"
scaffold_0      212965  "NSUN7"
scaffold_0      214055  "APBB2"
scaffold_0      223686  "APBB2"
scaffold_0      227687  "APBB2"
scaffold_0      306105  "APBB2"
scaffold_0      307000  "APBB2"
scaffold_0      538855  "LIMCH1"
scaffold_0      539051  "LIMCH1"
scaffold_0      543347  "LIMCH1"
scaffold_0      568182  "LIMCH1"
scaffold_0      577502  "LIMCH1"
scaffold_0      578933  "LIMCH1"