寻找更好的方法来可视化R和ggplot2中的分布

时间:2016-02-21 12:52:25

标签: r ggplot2

我希望可视化以下数据:一家酒店观察到,每年都有一部分客户是回头客。每年约有一半的客户是首次光顾的客户,约20%是第二次光顾的客户,依此类推。下面是一段包含数据和可视化的R代码。不过我对结果并不满意,希望加以改进:

  • R不喜欢有很多颜色的色带 - 所以也许是组数据?
  • 步骤曲线是否会更好地显示?
  • 访问次数被当作因子(factor)处理 - 这是正确的做法吗?

  • 堆叠条形图便于比较首次来访的客人,但不便于比较其他客人。我应该选择其他可视化方式吗?

    #! /usr/bin/env R CMD BATCH
    
    library(ggplot2)
    
    # Guest counts per year, split by how many times the guest has visited.
    d <- read.table(header=TRUE, text='
        year visit count
        2013 1 1641
        2013 2 604
        2013 3 256
        2013 4 89
        2013 5 32
        2013 6 10
        2013 7 4
        2013 8 3
        2014 1 1365
        2014 2 637
        2014 3 276
        2014 4 154
        2014 5 86
        2014 6 39
        2014 7 19
        2014 8 6
        2014 9 4
        2014 10 2
        2014 11 1
        2014 12 1
        2015 1 1251
        2015 2 608
        2015 3 288
        2015 4 143
        2015 5 88
        2015 6 52
        2015 7 21
        2015 8 8
        2015 9 8
        2015 10 3
        2015 11 2
        2015 12 1')
    
    # Year and visit number are both used as categories in this chart.
    d[c("year", "visit")] <- lapply(d[c("year", "visit")], factor)
    
    # One bar per year; position="fill" stacks the visit groups and rescales
    # every bar to the same height, so the segments read as shares of the year.
    p <- ggplot(d, aes(year, count)) +
      geom_bar(aes(fill=visit), stat="identity", position="fill") +
      labs(x="Year", y="Distribution")
    # pdf("returners.pdf",9,6)
    print(p)
    # dev.off()
    

enter image description here

2 个答案:

答案 0 :(得分:3)

为什么不将它们视为实际分布?

# Show each year as its own distribution: visit number on x, count on y,
# one facet per year.
p <- ggplot(d, aes(visit, count)) +
  geom_bar(stat="identity", width=0.75) +
  # expand=c(0,0) drops the default padding around both axes.
  scale_x_discrete(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  facet_wrap(~year) +
  labs(x=NULL, y="Visits") +
  ggthemes::theme_tufte(base_family="Helvetica") +
  # Theme tweaks on top of the Tufte theme: thin horizontal major grid lines
  # only, no ticks, left-aligned facet titles, extra horizontal room between
  # the panels.
  theme(
    legend.position="none",
    panel.grid=element_line(color="#2b2b2b", size=0.15),
    panel.grid.minor=element_blank(),
    panel.grid.major.x=element_blank(),
    axis.ticks=element_blank(),
    strip.text=element_text(hjust=0),
    panel.margin.x=unit(1, "cm")
  )
p

enter image description here

要按年份查看访问次数增量,您只需交换构面:

# Swap the facets: one panel per visit number, with the years on the x-axis,
# drawn as lollipops (a thin segment from 0 up to a point at the count).
d$year  <- factor(d$year)
# `visit` may arrive here as a factor. sprintf("%d") on a factor formats the
# factor's internal integer codes rather than its labels, so recover the
# actual visit numbers from the labels before building the facet titles.
d$visit <- as.integer(as.character(d$visit))
# Levels are listed in numeric order so the panels appear as Visit: 1, 2, 3, ...
d$visit <- factor(sprintf("Visit: %d", d$visit),
                  levels=sprintf("Visit: %d", sort(unique(d$visit))))

p <- ggplot(d, aes(year, count))
p <- p + geom_segment(aes(xend=year, yend=0), size=0.3)
p <- p + geom_point()
p <- p + scale_x_discrete(expand=c(0, 0.25))
p <- p + scale_y_continuous(label=scales::comma)
# Each panel gets its own y-range; the x-axis title doubles as a warning
# to the reader about that.
p <- p + facet_wrap(~visit, scales="free_y")
p <- p + labs(x="NOTE: Free y-axis scale", y="Count")
p <- p + ggthemes::theme_tufte(base_family="Helvetica") 
p <- p + theme(legend.position="none")
# Thin horizontal major grid lines only.
p <- p + theme(panel.grid=element_line(color="#2b2b2b", size=0.15))
p <- p + theme(panel.grid.minor=element_blank())
p <- p + theme(panel.grid.major.x=element_blank())
p <- p + theme(axis.ticks=element_blank())
p <- p + theme(strip.text=element_text(hjust=0))
p <- p + theme(panel.margin=unit(1.5, "cm"))
p

enter image description here

或者,您可以按访问次数查看同比变化(%):

library(dplyr)

# Year-over-year relative change in count, computed separately for each
# visit number.
d <- d %>%
  group_by(visit) %>%
  arrange(year) %>%
  mutate(lag=lag(count),  # previous row's count within the same visit group
         chg_pct=(count-lag)/lag,
         # The first row of a group has nothing to compare against: plot as 0.
         chg_pct=ifelse(is.na(chg_pct), 0, chg_pct),
         # "-1", "0" or "1": direction of the change, used to pick the colour.
         pos=as.character(sign(chg_pct))) %>%
  # Drop the grouping so later code working on `d` is not silently per-group.
  ungroup()

p <- ggplot(d, aes(year, chg_pct))
p <- p + geom_hline(yintercept=0, color="#2b2b2b", size=0.5)
p <- p + geom_segment(aes(xend=year, yend=0, color=pos), size=0.3)
p <- p + geom_point(aes(color=pos))
p <- p + scale_x_discrete(expand=c(0, 0.25))
p <- p + scale_y_continuous(label=scales::percent)
# Colours are named by the value of `pos`, so the mapping does not depend on
# how the strings "-1", "0", "1" happen to sort or on all three being present.
p <- p + scale_color_manual(values=c("-1"="#b2182b", "0"="#878787", "1"="#7fbc41"))
p <- p + facet_wrap(~visit, scales="free_y")
p <- p + labs(x="NOTE: free y-axis", y="YoY % Difference per visit count")
p <- p + ggthemes::theme_tufte(base_family="Helvetica") 
p <- p + theme(legend.position="none")
# Thin horizontal major grid lines only.
p <- p + theme(panel.grid=element_line(color="#2b2b2b", size=0.15))
p <- p + theme(panel.grid.minor=element_blank())
p <- p + theme(panel.grid.major.x=element_blank())
p <- p + theme(axis.ticks=element_blank())
p <- p + theme(strip.text=element_text(hjust=0))
p <- p + theme(panel.margin=unit(1.5, "cm"))
p

enter image description here

答案 1 :(得分:1)

您似乎正在尝试按先前访问次数比较对酒店总访问次数的贡献,并进行逐年比较。以下代码将它们放在一个图表中。

# Single chart comparing the years: dodged bars for the count at each visit
# number, plus a dotted line with points for the running total within a year.
d$year  <- factor(d$year)
# `visit` is deliberately left numeric: max() and scale_x_continuous() below
# are applied to it.
# d$visit <- factor(d$visit)
# Sort by year, then visit, so cumsum() accumulates in visit order within
# each year.
d <- transform(d[order(d$year, d$visit),], cum_count=ave(count, year, FUN=cumsum))

x_max <- max(d$visit)
y_max <- max(d$cum_count)
color_sch <- c("red","tan","blue")

p <- ggplot(data=d, aes(x=visit,  colour=year))
p <- p + geom_bar(aes(y= count, fill=year), position="dodge",stat="identity", width=.7)
p <- p + geom_line(aes(y = cum_count), linetype="dotted", size=1)
p <- p + geom_point(aes(y = cum_count), size=4)
p <- p + scale_y_continuous(breaks = seq(0,y_max, 250))
p <- p + scale_x_continuous(breaks=seq_len(x_max))
p <- p + scale_colour_manual(values=color_sch)
p <- p + scale_fill_manual(values=color_sch)
p <- p + xlab("Visit") + ylab("Count and \nCumulative Count")
# annotate() draws each caption exactly once. geom_text() with constant
# aesthetics would draw one copy per data row, stacked on top of each other.
p <- p + annotate("text", x = 2, y = d$count[2], label = "Count by Number of Visits",
                  hjust=-.5, vjust=-2.0, size=6, colour="Black")
p <- p + annotate("text", x = x_max-5, y = tail(d$cum_count,1), label = "Cumulative Count",
                  hjust=0, vjust=2.0, size=6, colour="Black")
# pdf("returners.pdf",9,6)
print(p)
# dev.off()

给出了图表

enter image description here

这种呈现方式表明,2015年与前几年相比有所下降,原因是首次光顾的客户减少,而回头客的访问次数也有所减少。