创建用于标识其他变量"级别"的变量

时间:2017-02-23 17:23:48

标签: sas levels

我有一个类似于下面简化表格的数据集(让我们称之为" DS_have"):

// Sample line items; the same id can occur on several entries.
var array = [
  { id: "cards", amount: 5 },
  { id: "shirts", amount: 3 },
  { id: "cards", amount: 2 },
  { id: "shirts", amount: 3 }
];

// Accumulate the total amount per id into a plain object keyed by id.
var result = {};
array.forEach(function (entry) {
  // An id seen for the first time starts from 0.
  var runningTotal = result[entry.id] || 0;
  result[entry.id] = runningTotal + entry.amount;
});

console.log(result);

我想创建一组数字变量来识别上面数据集中每个变量的离散类别/级别。结果应类似于以下数据集(" DS_want"):

SurveyID    Participant FavoriteColor   FavoriteFood    SurveyMonth
S101        G92         Blue            Pizza           Jan
S102        B34         Blue            Cake            Feb
S103        Z28         Green           Cake            Feb
S104        V11         Red             Cake            Feb
S105        P03         Yellow          Pizza           Mar
S106        A71         Red             Pizza           Mar
S107        C48         Green           Cake            Mar
S108        G92         Blue            Cake            Apr
...

基本上,我想知道应该使用什么语法,为 DS_Have 数据集中各变量的每个"级别"(即类别)生成唯一的数值。请注意,我不能使用条件 if/then 语句为每个变量的 ":Levels" 变量逐一创建值,因为某些变量的级别数量多达数千个。

2 个答案:

答案 0 :(得分:2)

一个简单的解决方案是使用 proc tabulate 生成各变量取值的列表,然后遍历该列表并创建 informat(输入格式),用来把文本转换为数字;之后只需使用 input 函数对它们进行编码即可。

/* Keep the list of variables to encode in one macro variable so that
   every later step can reuse it. */
%let vars = FavoriteColor FavoriteFood SurveyMonth;

/* Write the distinct values of each listed variable, with their counts,
   to the output data set FREQS. */
proc tabulate data=have out=freqs;
  class &vars;
  table (&vars), n;
run;

/* If the codes should follow a particular order, sort FREQS here by _TYPE_
   and then by that order. Without a sort the numbering may look odd, which
   is what happens in this example. */


*Now create a dataset to read in for informat.;
*Builds FOR_FMT, the control data set later passed to PROC FORMAT CNTLIN=.;
*Each distinct value in FREQS becomes one row: START holds the raw value, LABEL its sequence number within its _TYPE_ group, FMTNAME the informat it belongs to.;
*Each _TYPE_ group also gets one extra OTHER row that maps any value not listed to missing.;
data for_fmt;
  if 0 then set freqs;  *never executes - it only makes the variables of FREQS known to the step so the array below can reference them;
  array vars &vars.;
  retain type 'i';  *same TYPE on every row - the resulting entries are later applied with INPUT, i.e. as informats;
  do label = 1 by 1 until (last._type_);  *for each _type_, start with 1 and increment by 1;
    set freqs;
    by _type_ notsorted;
    which_var = find(_type_,'1');  *parses the '100' value from TYPE to see which variable this row is doing something to.  May not work if many variables - need another solution to identify which (depends on your data what works);

    *START is the value of the variable this row belongs to and FMTNAME is that variable name with an I suffix, e.g. FavoriteColorI;
    start = coalescec(vars[which_var]);
    fmtname = cats(vname(vars[which_var]),'I');
    output;
    if first._type_ then do; *set up what to do if you encounter a new value not coded - set it to missing;
      hlo='o';  *this means OTHER;
      start=' ';
      label=.;
      output;
      *reset HLO and put the loop counter back to 1 so the numbering of the real values continues unaffected by the OTHER row;
      hlo=' ';
      label=1;
    end;
  end;
run;

/* Read the control data set FOR_FMT so PROC FORMAT stores the informats
   it describes in the format catalog. */
proc format cntlin=for_fmt;
run;

然后像这样对它们进行编码(你也可以编写一个宏,对 &vars 宏变量中的各个变量循环执行此操作)。

/* Encode every variable listed in the vars macro variable, not only
   FavoriteColor. The FOR_FMT step names each informat after its variable
   plus an I suffix, so FavoriteFoodI. and SurveyMonthI. exist alongside
   FavoriteColorI. A value that has no entry in the informat falls into its
   OTHER row and is returned as missing. */
data want;
  set have;
  color_code = input(FavoriteColor, FavoriteColorI.);
  food_code  = input(FavoriteFood,  FavoriteFoodI.);
  month_code = input(SurveyMonth,   SurveyMonthI.);
run;

答案 1 :(得分:0)

另一种方法:为每个变量创建一个哈希对象,用来记录该变量已出现过的级别,并通过两个 DOW 循环把数据集读取两遍,在第二遍时写入级别编号。它可能不如 Joe 的解决方案那么优雅,但应该会少占用一点内存,而且我猜它在变量更多时也更容易扩展。

/*
 * levels_rename - add one numeric level-number variable per listed variable.
 *
 * Parameters:
 *   DATA    - input data set. It is read twice.
 *   OUT     - output data set: every row of DATA plus the new variables.
 *   VARS    - space-separated list of variables whose levels are numbered.
 *   NEWVARS - space-separated list of names for the new variables, in the
 *             same order as VARS. It must have at least as many names as
 *             VARS, otherwise the generated assignment has no target.
 *
 * Each distinct value of a variable gets the next integer (1, 2, 3, ...)
 * in the order the values are first met in DATA.
 *
 * NOTE(review): LEVEL and rc are used as work variables and dropped from
 * OUT, so DATA presumably must not contain variables with those names -
 * confirm against callers.
 */
%macro levels_rename(DATA,OUT,VARS,NEWVARS);
    %local i NUMVARS VARNAME;

    data &OUT;
    /* Never executes: only brings the variables of DATA into the step so
       the hash keys below refer to variables that already exist. */
    if 0 then set &DATA;
    length LEVEL 8;

    /* One hash object per variable: key = the variable, data = LEVEL. */
    %let i = 1;
    %let VARNAME = %scan(&VARS,&i);
    %do %while(&VARNAME ne );
        declare hash h&i();
        rc = h&i..definekey("&VARNAME");
        rc = h&i..definedata("LEVEL");
        rc = h&i..definedone();
      %let i = %eval(&i + 1);
      %let VARNAME = %scan(&VARS,&i);
    %end;
    %let NUMVARS = %eval(&i - 1);

    /* Pass 1: offer every value to its hash with the next free number.
       Only the first occurrence of a value is stored; the return code of
       a rejected duplicate is assigned to rc and ignored. */
    do _n_ = 1 by 1 until(eof);
        set &DATA end = eof;
      %do i = 1 %to &NUMVARS;
        LEVEL = h&i..num_items + 1;
        rc = h&i..add();
      %end;
    end;

    /* Pass 2: _n_ now holds the number of rows read in pass 1. Re-read
       that many rows, look up each stored level and write the row out. */
    do _n_ = 1 to _n_;
      set &DATA;
      %do i = 1 %to &NUMVARS;
        rc = h&i..find();
        %scan(&NEWVARS,&i) = LEVEL;
      %end;
      output;
    end;

    /* rc was previously left in OUT as a stray column; drop it together
       with LEVEL so only the input variables and NEWVARS remain. */
    drop LEVEL rc;
    run;
%mend;

%levels_rename(sashelp.class,class_renamed,NAME SEX, NAME_L SEX_L);