如何将数据字符串转换为多个列名

时间:2018-11-11 12:51:50

标签: r dataframe

在此先感谢您的帮助。我有几千行,需要进行归一化,以便可以进行分析。 数据结构-

  

第1行-{'gender':'Male','nationality':'POL','document_type':   'national_identity_card','date_of_expiry':'2024-07-21',   'issuing_country':'POL'}

     

第2行-{'gender':'Female','nationality':'LTU','document_type':   'national_identity_card','date_of_expiry':'2023-06-27',   'issuing_country':'LTU'}

     

第3行{'document_type':'driving_licence','date_of_expiry':   '2044-12-14','issuing_country':'GRC'}

     

第4行{'gender':'Male','document_type':'driving_licence',   'date_of_expiry':'2024-08-05','issuing_country':'GBR'}

我想要 (1)性别(gender)作为列名,Male或Female作为值 (2)国籍(nationality)作为列名 (3)document_type作为列名 (4)到期日(date_of_expiry)作为列名 (5)签发国家/地区(issuing_country)作为列名

请注意,每一行可能包含也可能不包含全部元素。

预先感谢您的帮助。

1 个答案:

答案 0 :(得分:1)

我们假设您有几千行,如下所示:

{'gender': 'Male', 'nationality': 'POL', 'document_type': 'national_identity_card', 'date_of_expiry': '2024-07-21', 'issuing_country': 'POL'}
{'gender': 'Female', 'nationality': 'LTU', 'document_type': 'national_identity_card', 'date_of_expiry': '2023-06-27', 'issuing_country': 'LTU'}
{'document_type': 'driving_licence', 'date_of_expiry': '2044-12-14', 'issuing_country': 'GRC'}
{'gender': 'Male', 'document_type': 'driving_licence', 'date_of_expiry': '2024-08-05', 'issuing_country': 'GBR'}

这既不是有效的常规JSON,也不是有效的ndjson,同样不是有效的javascript对象语法。

一种处理方法是通过将单引号替换为双引号将其转换为有效的ndjson:

library(magrittr)
library(ndjson)
library(stringi)

# Demo input: four records, one per line, written with single-quoted keys and values.
readLines(textConnection("{'gender': 'Male', 'nationality': 'POL', 'document_type': 'national_identity_card', 'date_of_expiry': '2024-07-21', 'issuing_country': 'POL'}
{'gender': 'Female', 'nationality': 'LTU', 'document_type': 'national_identity_card', 'date_of_expiry': '2023-06-27', 'issuing_country': 'LTU'}
{'document_type': 'driving_licence', 'date_of_expiry': '2044-12-14', 'issuing_country': 'GRC'}
{'gender': 'Male', 'document_type': 'driving_licence', 'date_of_expiry': '2024-08-05', 'issuing_country': 'GBR'}")) %>% 
  # Replace every single quote that is not preceded by a backslash with a double quote.
  # A negative look-behind is used so the preceding character is not consumed by the
  # match; this keeps adjacent quotes (e.g. an empty value '') and a quote at the very
  # start of a line working.
  stri_replace_all_regex("(?<!\\\\)'", '"') %>%
  ndjson::flatten("tbl") # turn the character vector of now-valid ndjson lines into a tibble
## # A tibble: 4 x 5
##   date_of_expiry document_type         gender issuing_country nationality
##   <chr>          <chr>                 <chr>  <chr>           <chr>      
## 1 2024-07-21     national_identity_ca… Male   POL             POL        
## 2 2023-06-27     national_identity_ca… Female LTU             LTU        
## 3 2044-12-14     driving_licence       NA     GRC             NA         
## 4 2024-08-05     driving_licence       Male   GBR             NA 

另一种方法是使用V8包,因为在javascript领域中,我们可以使用eval(),无论出于何种原因,它对于指定javascript对象的方式都不太挑剔:

library(magrittr)
library(data.table)
library(V8)

ctx <- v8()

readLines(textConnection("{'gender': 'Male', 'nationality': 'POL', 'document_type': 'national_identity_card', 'date_of_expiry': '2024-07-21', 'issuing_country': 'POL'}
{'gender': 'Female', 'nationality': 'LTU', 'document_type': 'national_identity_card', 'date_of_expiry': '2023-06-27', 'issuing_country': 'LTU'}
{'document_type': 'driving_licence', 'date_of_expiry': '2044-12-14', 'issuing_country': 'GRC'}
{'gender': 'Male', 'document_type': 'driving_licence', 'date_of_expiry': '2024-08-05', 'issuing_country': 'GBR'}")) %>% 
  lapply(function(line) {
    ctx$eval(sprintf("var line = eval('(' + \"%s\" +')');", line))
    ctx$get("line")
  }) %>% 
  data.table::rbindlist(fill=TRUE) %>% 
  as.data.frame()
##   gender nationality          document_type date_of_expiry issuing_country
## 1   Male         POL national_identity_card     2024-07-21             POL
## 2 Female         LTU national_identity_card     2023-06-27             LTU
## 3   <NA>        <NA>        driving_licence     2044-12-14             GRC
## 4   Male        <NA>        driving_licence     2024-08-05             GBR

//
//  GameScene.swift
//  Flarrow
//
//  Created by Денис Андрейчук on 11/9/18.
//  Copyright © 2018 Денис Андрейчук. All rights reserved.
//

import SpriteKit

/// Scene that renders a single red polyline. The last vertex (the "tip") is
/// advanced on every frame, and each touch starts a new segment heading
/// diagonally left or right.
class GameScene: SKScene {
    /// Direction in which the tip of the line travels on each frame.
    enum State {
        case moveUp
        case moveLeft
        case moveRight
    }

    /// Current travel direction; switched by `touchesBegan(_:with:)`.
    var currentState = State.moveUp
    /// Vertices of the polyline. The last element is the moving tip.
    var pathArray = [CGPoint]()
    /// Shape node that draws `pathArray`.
    let line = SKShapeNode()

    override func didMove(to view: SKView) {
        backgroundColor = .gray
        currentState = .moveUp

        // Seed the line with two coincident points: the fixed start and the moving tip.
        let start = CGPoint(x: 0, y: -UIScreen.main.bounds.height)
        pathArray.append(start)
        pathArray.append(start)
        createLine()
    }

    /// Picks the new direction from the touch position and starts a new segment
    /// by duplicating the current tip.
    ///
    /// A touch with positive `x` in scene coordinates selects `.moveRight`,
    /// anything else selects `.moveLeft`.
    /// NOTE(review): this presumably assumes the scene origin is horizontally
    /// centred — confirm against the scene's anchor point.
    override func touchesBegan(_ touches: Set<UITouch>, with event: UIEvent?) {
        // Bail out instead of crashing when there is no touch or no tip to duplicate.
        guard let touch = touches.first, let tip = pathArray.last else { return }

        currentState = touch.location(in: self).x > 0 ? .moveRight : .moveLeft
        pathArray.append(tip)
    }

    /// Configures the shape node's appearance, gives it the current path and
    /// attaches it to the scene.
    func createLine() {
        line.path = makePath()
        line.fillColor = .clear
        line.lineWidth = 5
        line.strokeColor = .red

        // Adding a node that already has a parent is an error, so attach only once.
        if line.parent == nil {
            addChild(line)
        }
    }

    /// Moves the tip one unit according to `currentState` and redraws the line.
    override func update(_ currentTime: TimeInterval) {
        // Nothing to advance while the path has no vertices.
        guard let tipIndex = pathArray.indices.last else { return }

        // Every state moves the tip by +1 in y; left/right additionally shift x.
        pathArray[tipIndex].y += 1
        switch currentState {
        case .moveUp:
            break
        case .moveLeft:
            pathArray[tipIndex].x -= 1
        case .moveRight:
            pathArray[tipIndex].x += 1
        }

        line.path = makePath()
    }

    /// Builds a path that starts at the first vertex and connects the remaining
    /// vertices in order. Returns an empty path when there are no vertices.
    private func makePath() -> CGPath {
        let path = CGMutablePath()
        guard let first = pathArray.first else { return path }

        path.move(to: first)
        for point in pathArray.dropFirst() {
            path.addLine(to: point)
        }
        return path
    }
}

由于我们对您的数据一无所知,因此这两种方法都很脆弱。例如:数据中是否可能出现被转义的单引号?是否可能嵌入双引号?这确实是您的数据所使用的格式吗?

请注意,在某些系统上V8可能很难安装和运行;如果您无法在自己的系统上安装V8,请不要就此发表评论。