R - 删除矩阵中出现多次的术语

时间:2018-01-15 00:02:53

标签: r matrix words

我正在寻找使用R来提高短文本文档(推文)上LDA主题建模效果的方法。凭直觉,我想删除那些并不真正指向任何特定主题、却往往同时出现在多个主题中的通用词。

我有一个包含10列和50行的矩阵,如下所示:

      Topic 1    Topic 2     Topic 3      Topic 4     Topic 5    Topic 6      Topic 7     Topic 8     Topic 9     
 [1,] "just"     "much"      "like"       "know"      "just"     "will"       "people"    "still"     "even"      
 [2,] "like"     "good"      "much"       "love"      "make"     "call"       "need"      "time"      "just"      
 [3,] "good"     "shit"      "good"       "back"      "time"     "like"       "just"      "love"      "tell"      
 [4,] "make"     "come"      "think"      "come"      "life"     "much"       "much"      "want"      "make"      
 [5,] "play"     "fuck"      "want"       "good"      "take"     "want"       "like"      "real"      "real"      
 [6,] "think"    "real"      "thank"      "ever"      "people"   "ever"       "think"     "life"      "good"      
 [7,] "right"    "even"      "find"       "night"     "start"    "love"       "know"      "make"      "think"     
 [8,] "fuck"     "make"      "little"     "every"     "work"     "feel"       "real"      "tell"      "hello"     
 [9,] "need"     "look"      "love"       "will"      "much"     "hello"      "time"      "will"      "know"      
[10,] "love"     "like"      "feel"       "keep"      "wish"     "happy"      "love"      "just"      "need"      
[11,] "every"    "friend"    "watch"      "take"      "week"     "never"      "year"      "hard"      "look"      
[12,] "gonna"    "thank"     "hate"       "tell"      "back"     "work"       "shit"      "fuck"      "want"      
[13,] "hard"     "talk"      "actual"     "sleep"     "think"    "real"       "wanna"     "today"     "friend"    
[14,] "girl"     "wanna"     "game"       "first"     "love"     "stop"       "good"      "live"      "keep"      
[15,] "game"     "want"      "change"     "actual"    "live"     "start"      "play"      "back"      "always"    
[16,] "life"     "think"     "everything" "work"      "right"    "last"       "look"      "take"      "thank"     
[17,] "come"     "will"      "year"       "today"     "night"    "friend"     "hello"     "watch"     "ain"       
[18,] "will"     "feel"      "shit"       "wait"      "many"     "nigga"      "give"      "good"      "like"      
[19,] "friend"   "just"      "someone"    "real"      "okay"     "today"      "feel"      "think"     "much"      
[20,] "bitch"    "girl"      "back"       "great"     "talk"     "think"      "back"      "stop"      "okay"      
[21,] "sure"     "always"    "something"  "something" "will"     "hope"       "take"      "year"      "right"     
[22,] "miss"     "miss"      "mean"       "watch"     "year"     "come"       "also"      "right"     "feel"      
[23,] "never"    "something" "many"       "shit"      "last"     "mind"       "follow"    "great"     "show"      
[24,] "also"     "first"     "make"       "please"    "fuck"     "okay"       "wait"      "feel"      "mean"      
[25,] "someone"  "tweet"     "take"       "much"      "show"     "make"       "girl"      "look"      "never"     
[26,] "wanna"    "many"      "everyone"   "wish"      "never"    "leave"      "never"     "always"    "leave"     
[27,] "wait"     "someone"   "also"       "always"    "know"     "hour"       "something" "hate"      "someone"   
[28,] "people"   "find"      "lose"       "never"     "good"     "people"     "talk"      "like"      "miss"      
[29,] "always"   "take"      "sleep"      "leave"     "today"    "hear"       "hard"      "never"     "happy"     
[30,] "feel"     "tell"      "hello"      "follow"    "please"   "right"      "still"     "sleep"     "nigga"     
[31,] "call"     "know"      "friend"     "show"      "look"     "lmao"       "hear"      "know"      "start"     
[32,] "look"     "happy"     "last"       "gonna"     "gonna"    "someone"    "miss"      "happy"     "last"      
[33,] "live"     "still"     "will"       "everyone"  "shit"     "keep"       "money"     "play"      "people"    
[34,] "ever"     "watch"     "life"       "hello"     "thank"    "tweet"      "actual"    "thank"     "wait"      
[35,] "give"     "hear"      "show"       "thank"     "even"     "everything" "mind"      "work"      "money"     
[36,] "tweet"    "life"      "still"      "lmao"      "hello"    "look"       "happen"    "someone"   "shit"      
[37,] "everyone" "sleep"     "call"       "want"      "feel"     "take"       "start"     "friend"    "still"     
[38,] "mind"     "little"    "hour"       "last"      "real"     "first"      "show"      "little"    "back"      
[39,] "real"     "never"     "ain"        "like"      "someone"  "little"     "always"    "gonna"     "gonna"     
[40,] "hope"     "leave"     "wish"       "okay"      "ain"      "game"       "right"     "hello"     "come"      
[41,] "lmao"     "please"    "work"       "happy"     "tweet"    "money"      "want"      "change"    "work"      
[42,] "happen"   "wait"      "right"      "girl"      "happen"   "show"       "bitch"     "also"      "everything"
[43,] "time"     "hello"     "need"       "think"     "hour"     "hate"       "tell"      "wait"      "give"      
[44,] "please"   "great"     "real"       "even"      "anything" "follow"     "hate"      "find"      "lmao"      
[45,] "still"    "nigga"     "hard"       "hate"      "always"   "live"       "gonna"     "hope"      "time"      
[46,] "week"     "hope"      "great"      "talk"      "bitch"    "always"     "change"    "talk"      "talk"      
[47,] "lose"     "hate"      "people"     "ain"       "mind"     "great"      "nigga"     "night"     "mind"      
[48,] "find"     "bitch"     "look"       "someone"   "girl"     "just"       "many"      "week"      "hour"      
[49,] "take"     "wish"      "always"     "many"      "stop"     "time"       "sure"      "something" "little"    
[50,] "hour"     "work"      "fuck"       "live"      "leave"    "night"      "watch"     "lmao"      "lose"      
      Topic 10  
 [1,] "know"    
 [2,] "time"    
 [3,] "will"    
 [4,] "hello"   
 [5,] "make"    
 [6,] "people"  
 [7,] "mean"    
 [8,] "year"    
 [9,] "need"    
[10,] "anything"
[11,] "want"    
[12,] "every"   
[13,] "love"    
[14,] "life"    
[15,] "good"    
[16,] "fuck"    
[17,] "stop"    
[18,] "gonna"   
[19,] "happen"  
[20,] "follow"  
[21,] "game"    
[22,] "nigga"   
[23,] "sure"    
[24,] "tweet"   
[25,] "great"   
[26,] "real"    
[27,] "show"    
[28,] "hope"    
[29,] "always"  
[30,] "miss"    
[31,] "money"   
[32,] "ain"     
[33,] "lose"    
[34,] "bitch"   
[35,] "tell"    
[36,] "keep"    
[37,] "call"    
[38,] "right"   
[39,] "take"    
[40,] "much"    
[41,] "never"   
[42,] "like"    
[43,] "shit"    
[44,] "sleep"   
[45,] "leave"   
[46,] "lmao"    
[47,] "everyone"
[48,] "play"    
[49,] "look"    
[50,] "watch" 

我想把在多列中都出现的词,从它们出现的每一列中全部删除。各主题包含的关键词数量可以不相同。

P.S.:如果有关于在R中对短文本使用LDA的其他相关见解,也非常欢迎分享。

1 个答案:

答案 0 :(得分:0)

删除出现在多个列中的单词,等价于只保留那些仅出现在某一列、而不出现在任何其他列中的单词。

对于每一列X,取X列相对于除X之外所有其他列的setdiff:
# For every column of `df`, keep only the words that occur in no other column.
# The result is a list with one character vector per column. setdiff() also
# discards duplicates, so each surviving word is listed once per column.
# seq_len() is used instead of 1:ncol(df) so that a zero-column input yields
# an empty list rather than iterating over c(1, 0).
lapply(seq_len(ncol(df)), function(ix) {
  # Words of the column currently being examined.
  a <- as.character(df[, ix])
  # Words of all remaining columns, flattened into one vector.
  # drop = FALSE keeps the two-dimensional shape even if one column is left;
  # FALSE is spelled out because `F` is an ordinary, reassignable variable.
  b <- as.character(unlist(df[, -ix, drop = FALSE]))
  setdiff(a, b)
})

这会生成一个列表,其中每个元素是一个字符向量,包含对应列中独有的单词。

注意:如果某个单词在同一列中出现多次、但在其他所有列中都没有出现,它仍然会被保留。如果要求单词在整个矩阵中只出现一次,则还需要做一些额外的处理。

正如上述评论所述,目前,这会为问题中提供的单词矩阵产生一个空列表。