上海地铁数据可视化

## encoding
options(encoding = "UTF-8")      ## for chines
## use getOption("encoding") to see if things were changed

loc <- function(os, language = "english") {
  switch(language,
         english = ifelse(os == "Windows", "English_United States.1252", "en_US.UTF-8"),
         chinese = ifelse(os == "Windows", "Chinese", "zh_CN.utf-8"))
}
## set locale
Sys.setlocale(category = "LC_ALL", loc(Sys.info()[["sysname"]], "chinese"))

##加载工作环境和所需包
setwd("/Users/jeevanyue/Rproject/map/SHMetro")
library(data.table)
library(bit64)
library(dplyr)
library(tidyr)
library(scales)
library(lubridate)  #日期处理包
library(shiny)
library(leaflet)
library(lattice)
library(plotly)
library(chorddiag)  #绘制chord

1. 地铁数据

#地铁站进站数据和出站数据
shmetro_in <- fread("data/shmetro_in.csv",encoding="UTF-8")
shmetro_out <- fread("data/shmetro_out.csv",encoding="UTF-8")

## 进出地铁站数据
shmetro_line_in_out <- fread("data/shmetro_line_in_out.csv",encoding="UTF-8")

## 进出地铁站关联
in_out <- shmetro_line_in_out %>%
  spread(line_out,count)
in_out[is.na(in_out)]<-0

## 地铁站经纬度
stations <- fread("data/stations.csv",encoding="UTF-8")
stations <- stations %>% 
  select(c(1:5)) %>%
  arrange(line,line_id)
stations_no <- nrow(stations)
for (i in 1:stations_no) {
    s <- stations$station[i]
    stations$lines[i] <- paste(stations[stations$station==s,]$line,sep="",collapse="/")
}

1.1 地铁站经纬度

stations <- fread("data/stations.csv",encoding="UTF-8")
stations <- stations %>% 
  select(c(1:5)) %>%
  arrange(line,line_id)

stations_no <- nrow(stations)
for (i in 1:stations_no) {
    s <- stations$station[i]
    stations$lines[i] <- paste(stations[stations$station==s,]$line,sep="",collapse="/")
}
invisible(gc())

1.2 交通卡交易数据

交通卡的交易信息有7个字段,分别是:卡号、交易日期、交易时间、站点名称、行业名称、交易金额、交易性质。

  • 卡号:交通卡卡号
  • 交易日期:日期格式yyyy-mm-dd
  • 交易时间:时间个是hh:mm:ss
  • 站点名称:内容包括线路和站名,如:“1号线莘庄”
  • 行业名称:都是“地铁”
  • 交易金额:0和大于0的值,0表示进站,大于0的值表示出战
  • 交易性质:“优惠”和“非优惠”
#mac下用如下读取
system.time(trade <- read.csv("/Users/jeevanyue/Desktop/SPTCC-20150401.csv",header = F,sep=",", fileEncoding = "GB2312"))
#windows下用如下读取
#system.time(trade <- fread("SPTCC-20150401/SPTCC-20150401.csv",integer64='character',stringsAsFactors=F))
#trade <- read.csv('data/SPTCC-20150401_Sample.txt',header=T,encoding='UTF-8',stringsAsFactors = F)

#重命名
names(trade) <- c('card_id','date','time','station','vehicle','money','property')
#筛选地铁数据
trade_metro <- trade %>%
  filter(vehicle=='地铁')
rm(trade)
invisible(gc())

#将"station"(原含义为线路和站名),分为"line"和"station"
trade_metro <- trade_metro %>% 
  separate(station, c('line', 'station'), sep = '号线')
invisible(gc())

#按五分钟统计时间,向上取整
trade_metro <- trade_metro %>%
  mutate(M5=ceiling(period_to_seconds(hms(time))/300))
invisible(gc())

#删除不需要的字段
trade_metro <- trade_metro %>%
  select(-vehicle,-property,-date)
invisible(gc())

#时间格式
#trade_metro$time <-  strptime(paste("2015-04-01", trade_metro$time, sep=' '), "%Y-%m-%d %H:%M:%S", tz = "GMT")

1.3 处理异常值

## 对与stations地铁站名不一致的trade数据进行处理
trade_metro[trade_metro$station=="淞浜路",]$station <- "淞滨路"
trade_metro[trade_metro$station=="大木桥路 ",]$station <- "大木桥路"
trade_metro[trade_metro$station=="上海大学站",]$station <- "上海大学"

1.4 进/出站数据

## 进站数据
trade_metro_in <- trade_metro %>%
  filter(money==0) %>%
  select(card_id,"time_in"=time,"line_in"=line,"station_in"=station,"M5_in"=M5)
## 出站数据
trade_metro_out <- trade_metro %>%
  filter(money>0)%>%
  select(card_id,"time_out"=time,"line_out"=line,"station_out"=station,money,"M5_out"=M5)

1.5 虚拟换乘

上海火车站为虚拟换乘,删除半小时内3/4换1和1换3/4的数据

3/4换1的数据

## 3/4换1的数据,统计发现在上海火车站3/4号线出站以3号线名义出站
trade_metro_out_34 <- trade_metro_out %>%
  filter(station_out=='上海火车站') %>%
  filter(line_out==3 | line_out==4)
trade_metro_in_1 <- trade_metro_in %>%
  filter(station_in=='上海火车站', line_in==1)

## merge出站和进站的数据
trade_metro_out34_in1 <- merge(trade_metro_out_34,trade_metro_in_1,all.x=T) %>%
  mutate(duration=period_to_seconds(hms(time_in)) - period_to_seconds(hms(time_out))) %>%
  filter(duration>0,duration<=60*30)

### 数据大了什么样的数据都有,发现有几个人在3/4号线出站后,半小时内在1号线进站多次

#trade_metro_out34_in1 <- na.omit(trade_metro_out34_in1)

## 根据卡号和进站时间,查询最近的进站时间,作为本次进站时间
trade_metro_out34_in1 <- data.table(trade_metro_out34_in1)
trade_metro_out34_in1[, duration_min := min(duration), by=list(card_id, M5_in)]
trade_metro_out34_in1 <- trade_metro_out34_in1 %>%
  filter(duration==duration_min) %>%
  select(-duration_min)

## 统计发现绝大部分人在10分钟内完成换乘
#histogram(ceiling(trade_metro_out34_in1$duration/60))

## rbind出站数据
trade_metro_out <- rbind(trade_metro_out, trade_metro_out34_in1[,c(1:6)])
## 删除全部重复的出站数据
trade_metro_out <- trade_metro_out[!(duplicated(trade_metro_out) | duplicated(trade_metro_out, fromLast = TRUE)), ]

## rbind进站数据
trade_metro_in <- rbind(trade_metro_in, trade_metro_out34_in1[,c(1,7:10)])
## 删除全部重复的进站数据
trade_metro_in <- trade_metro_in[!(duplicated(trade_metro_in) | duplicated(trade_metro_in, fromLast = TRUE)), ]

1换3/4的数据

## 1换3/4的数据,统计发现在3/4号线上海火车站以3号线名义进站
trade_metro_out_1 <- trade_metro_out %>%
  filter(station_out=='上海火车站',line_out==1) 
trade_metro_in_34 <- trade_metro_in %>%
  filter(station_in=='上海火车站') %>%
  filter(line_in==3 | line_in==4)

## merge出站和进站的数据
trade_metro_out1_in34 <- merge(trade_metro_out_1,trade_metro_in_34,all.x=T, all.y=F) %>%
  mutate(duration=period_to_seconds(hms(time_in)) - period_to_seconds(hms(time_out))) %>%
  filter(duration>0,duration<=60*30)

#trade_metro_out1_in34 <- na.omit(trade_metro_out1_in34)

## 根据卡号和进站时间,查询最近的进站时间,作为本次进站时间
trade_metro_out1_in34 <- data.table(trade_metro_out1_in34)
trade_metro_out1_in34[, duration_min := min(duration), by=list(card_id, M5_in)]
trade_metro_out1_in34 <- trade_metro_out1_in34 %>%
  filter(duration==duration_min) %>%
  select(-duration_min)

## 统计发现绝大部分人在10分钟内完成换乘
#histogram(ceiling(trade_metro_out1_in34$duration/60))

## rbind出站数据
trade_metro_out <- rbind(trade_metro_out, trade_metro_out1_in34[,c(1:6)])
## 删除全部重复的出站数据
trade_metro_out <- trade_metro_out[!(duplicated(trade_metro_out) | duplicated(trade_metro_out, fromLast = TRUE)), ]

## rbind进站数据
trade_metro_in <- rbind(trade_metro_in, trade_metro_out1_in34[,c(1,7:10)])
## 删除全部重复的进站数据
trade_metro_in <- trade_metro_in[!(duplicated(trade_metro_in) | duplicated(trade_metro_in, fromLast = TRUE)), ]

1.6 地铁站进站数据

根据消费金额为0,每5分钟统计每站地铁的进站人数

trade_metro_in_station <- trade_metro_in %>%
  group_by(station_in, M5_in) %>%
  summarise(count=n()) %>%
  select(station=station_in, M5=M5_in, count)

#trade_metro_in_station <- na.omit(trade_metro_in_station)
invisible(gc())

## 合并地铁站坐标
shmetro_in <- merge(trade_metro_in_station,stations,all.x=T, all.y=F)
#rm(trade_metro_in_station)

## 查看未匹配到的地铁站
#l <- shmetro_in[is.na(shmetro_in$gps_lat),]
#unique(l$station)
#trade_metro_in_station[trade_metro_in_station$station=="淞浜路",]$station <- "淞滨路"
#trade_metro_in_station[trade_metro_in_station$station=="大木桥路 ",]$station <- "大木桥路"
#trade_metro_in_station[trade_metro_in_station$station=="上海大学站",]$station <- "上海大学"
#stations[grepl("淞滨路", stations$station),]$station
#trade_metro_in_station[grepl("淞浜路", trade_metro_in_station$station),]$station <- "淞滨路"

#shmetro_in <- na.omit(shmetro_in)
invisible(gc())
#write.csv(shmetro_in,"shmetro_in.csv",row.names = F,fileEncoding="UTF-8")

1.7 地铁站出站数据

根据消费金额大雨0,每5分钟统计每站地铁的出站人数

trade_metro_out_station <- trade_metro_out %>%
  group_by(station_out, M5_out) %>%
  summarise(count=n()) %>%
  select(station=station_out, M5=M5_out, count)

#trade_metro_out_station <- na.omit(trade_metro_out_station)
invisible(gc())

## 合并地铁站坐标
shmetro_out <- merge(trade_metro_out_station,stations,all.x=T, all.y=F)
#rm(trade_metro_out_station)

#shmetro_out <- na.omit(shmetro_out)
invisible(gc())
#write.csv(shmetro_out,"shmetro_out.csv",row.names = F,fileEncoding="UTF-8")

1.8 地铁线路起始和终点

## merge进站和出站数据,并计算乘坐时间
trade_metro_in_out <- merge(trade_metro_in, trade_metro_out, all.x=T, all.y=F) %>%
  mutate(duration=period_to_seconds(hms(time_out)) - period_to_seconds(hms(time_in)), duration_M5=M5_out-M5_in) %>%
  filter(duration>0)

#根据卡号和进站时间,查询最近出站的时间,作为本次出站时间
trade_metro_in_out <- data.table(trade_metro_in_out)
trade_metro_in_out[, duration_min := min(duration), by=list(card_id, M5_in)]
trade_metro_in_out <- trade_metro_in_out %>%
  filter(duration==duration_min) %>%
  select(-duration_min)

#统计进站线路A->出站线路B的笔数
shmetro_line_in_out <- trade_metro_in_out %>%
  group_by(line_in,line_out) %>%
  summarise(count=n())

shmetro_line_in_out$line_in <- as.numeric(shmetro_line_in_out$line_in)
shmetro_line_in_out$line_out <- as.numeric(shmetro_line_in_out$line_out)

#排序
shmetro_line_in_out <- arrange(shmetro_line_in_out,line_in,line_out)

#将出站线路数据转换为属性字段
in_out <- shmetro_line_in_out %>%
  spread(line_out,count)

in_out[is.na(in_out)]<-0

#write.csv(shmetro_line_in_out,"shmetro_line_in_out.csv",row.names = F,fileEncoding="UTF-8")

2. 绘图

2.1 相关数据及地图

#地铁颜色
lines_color <- data.frame("line"=c(1:13,16),"color"=c("#ED3229","#36B854","#FFD823","#320176","#823094","#CF047A","#F3560F","#008CC1","#91C5DB","#C7AFD3","#8C2222","#007a61","#ec91cc","#32D2CA"))

pal <- colorFactor(as.character(lines_color$color), domain = stations$line)

#辅助函数绘制线路
draw_line_add <- function(l_no,line_s_id=NULL){
  line_color <- lines_color[lines_color$line==l_no,]$color
  line_data <- stations[stations$line==l_no,]
  if(is.null(line_s_id)){
    draw_lines <- Shanghai %>%
      addPolylines(lat=line_data$gps_lat,lng=line_data$gps_lon,color=line_color,weight=2)
  }else{
    draw_lines <- Shanghai %>%
      addPolylines(lat=line_data$gps_lat[line_s_id],lng=line_data$gps_lon[line_s_id],color=line_color,weight=2)
  }
  return(draw_lines)
}

## 上海线路地图
Shanghai <- leaflet() %>% 
  setView(lng = 121.60, lat = 31.20, zoom = 10) %>% 
  addProviderTiles("CartoDB.Positron") %>%
  addLegend(position = "bottomleft",pal=pal,values = stations$line)

for(l in unique(stations$line)){
  line_length <- nrow(stations[stations$line==l,])
  if(l==4){
    #由于4号线为环线,需将首尾相连
    Shanghai <- draw_line_add(l_no=l)
    Shanghai <- draw_line_add(l_no=l,line_s_id=c(1,line_length))
  }else if(l==10){
    #由于10号线在龙溪路站以后分为两条线路,需分两端绘制
    Shanghai <- draw_line_add(l_no=l,line_s_id=c(1:(line_length-3)))
    Shanghai <- draw_line_add(l_no=l,line_s_id=c(24,(line_length-2):line_length))
  }else if(l==11){
    #由于11号线在嘉定新城站以后分为两条线路,需分两端绘制
    Shanghai <- draw_line_add(l_no=l,line_s_id=c(1:(line_length-7)))
    Shanghai <- draw_line_add(l_no=l,line_s_id=c(28,(line_length-6):line_length))
  }else{
    Shanghai <- draw_line_add(l_no=l)
  }
}

2.2 chord图数据

##绘制chord图
metro_chord <- data.matrix(as.data.frame(in_out)[,c(2:15)])
haircolors <- in_out$line_in
dimnames(metro_chord) <- list(have = haircolors,
                    prefer = colnames(metro_chord))

groupColors <- c("#ED3229","#36B854","#FFD823","#320176","#823094","#CF047A","#F3560F","#008CC1","#91C5DB","#C7AFD3","#8C2222","#007a61","#ec91cc","#32D2CA")

#chorddiag(metro_chord, groupColors = groupColors, margin=50, showTicks=F, groupnamePadding = 5)

2.3 图形参数

b <- list(x = 0, y = 1,bgcolor = "#00FFFFFF")
yax <- list(
  title = "",
  zeroline = FALSE,
  showline = FALSE,
  showticklabels = FALSE,
  showgrid = FALSE
)

xax <- list(
  title = "",
  titlefont = list(size = 8),
  tickangle = -20,
  color = "black"
)

2.4 UI 和 SERVER

ui <- shinyUI(navbarPage("SHMetro",

  tabPanel("进站流量",
    div(class="outer",
            
      #tags$style(type = "text/css", "html, body {width:100%;height:100%}"),
      tags$style(type = "text/css", ".outer {position: fixed; top: 41px; left: 0; right: 0; bottom: 0; overflow: hidden; padding: 0}"),
      
      leafletOutput("map", width = "100%", height = "100%"),
      absolutePanel(top = 10, right = 10,
                    h4(textOutput("output_slider_time")),
                    sliderInput("slider_time", "Time:",
                                #min=as.POSIXct(min(filter(shmetro_in, M5>30)$M5)*5*60, origin = "2015-04-01", tz = "GMT"),
                                #max=as.POSIXct(max(shmetro_in$M5)*5*60, origin = "2015-04-01", tz = "GMT"),
                                #value=as.POSIXct(min(shmetro_in$M5)*5*60, origin = "2015-04-01", tz = "GMT"),
                                min = as.POSIXct(5*60*60, origin = "2015-04-01", tz = "GMT"),
                                max = as.POSIXct(24*60*60, origin = "2015-04-01", tz = "GMT"),
                                value = as.POSIXct(5*60*60, origin = "2015-04-01", tz = "GMT"),
                                step = 60*5,
                                timeFormat = "%T",
                                timezone = "GMT"),
                    selectInput("select_line", "Line",
                                c("All",lines_color$line)),
                    h4("TOP 5"),
                    plotlyOutput("in_top5",height = 200),
                    checkboxInput("legend", "Show legend", TRUE)
                    )
      )
    ),
    tabPanel("出站流量",
      div(class="outer",
              
        #tags$style(type = "text/css", "html, body {width:100%;height:100%}"),
        tags$style(type = "text/css", ".outer {position: fixed; top: 41px; left: 0; right: 0; bottom: 0; overflow: hidden; padding: 0}"),
        
        leafletOutput("map_out", width = "100%", height = "100%"),
        absolutePanel(top = 10, right = 10,
                      h4(textOutput("output_slider_time_out")),
                      sliderInput("slider_time_out", "Time:",
                                  #min=as.POSIXct(min(filter(shmetro_in, M5>30)$M5)*5*60, origin = "1960-01-01", tz = "GMT"),
                                  #max=as.POSIXct(max(shmetro_in$M5)*5*60, origin = "1960-01-01", tz = "GMT"),
                                  #value=as.POSIXct(min(shmetro_in$M5)*5*60, origin = "1960-01-01", tz = "GMT"),
                                min = as.POSIXct(5*60*60, origin = "2015-04-01", tz = "GMT"),
                                max = as.POSIXct(24*60*60, origin = "2015-04-01", tz = "GMT"),
                                value = as.POSIXct(5*60*60, origin = "2015-04-01", tz = "GMT"),
                                  step = 60*5,
                                  timeFormat = "%T",
                                  timezone = "GMT"),
                      selectInput("select_line_out", "Line",
                                  c("All",lines_color$line)),
                      h4("TOP 5"),
                      plotlyOutput("out_top5",height = 200),
                      checkboxInput("legend_out", "Show legend", TRUE)
                      )
        )
    ),
    tabPanel("线路关联",
      div(class="outer",
              
        #tags$style(type = "text/css", "html, body {width:100%;height:100%}"),
        tags$style(type = "text/css", ".outer {position: fixed; top: 41px; left: 0; right: 0; bottom: 0; overflow: hidden; padding: 0}"),
        chorddiagOutput("line_chord", width = "100%",height="100%")
      )
    )
  )
)

server <- shinyServer(function(input, output, session) {
  ## 进站流量统计
  # Reactive expression for the data subsetted to what the user selected
  filteredData <- reactive({
    if(input$select_line=="All"){
      shmetro_in %>%
        filter(M5==ceiling(period_to_seconds(hms(format(input$slider_time,"%H:%M:%S")))/300))
    }else{
      shmetro_in %>%
        filter(M5==ceiling(period_to_seconds(hms(format(input$slider_time,"%H:%M:%S")))/300),line==as.numeric(input$select_line))
    }
  })
  
  stations_in_top5 <- reactive({
    filteredData() %>%
      group_by(station) %>%
      summarise(count=sum(count),line=min(line)) %>%
      arrange(desc(count)) %>%
      head(5) %>%
      as.data.frame()
  })

  ## time
  output$output_slider_time  <- renderText({
    paste0("Time: ", format(input$slider_time,"%H:%M:%S"))
  })
  
  output$map <- renderLeaflet({
    Shanghai %>%
      addCircles(stations$gps_lon, stations$gps_lat,color = pal(stations$line), radius=1,popup = paste(stations$station,stations$lines),fillOpacity = 1,stroke = FALSE) %>%
      clearMarkerClusters() %>%
      clearMarkers()
  })

  observe({
    data_in_circle <- data.table(filteredData())[, count := sum(count), by=list(station, M5)] %>%
      arrange(count)
    
    leafletProxy("map", data = data_in_circle) %>%
      clearMarkerClusters() %>%
      clearMarkers() %>%
      addCircleMarkers(data_in_circle$gps_lon,data_in_circle$gps_lat, color = pal(data_in_circle$line), fillOpacity = 0.5,stroke = FALSE, popup=paste(data_in_circle$station,data_in_circle$line,data_in_circle$count,sep=","), radius=(data_in_circle$count)^(1/2.5))
  })
  
  # top5
  output$in_top5 <- renderPlotly({
    # If no stations_in_top5 are in view, don't plot
    if (nrow(stations_in_top5()) == 0)
      return(NULL)

    plot_ly(stations_in_top5(),
            x = stations_in_top5()$station,
            y = stations_in_top5()$count,
            type = "bar",
            marker = list(color = pal(stations_in_top5()$line)),
            bgcolor = "#00FFFFFF") %>%
      layout(showlegend=FALSE,
             yaxis=yax,xaxis=xax,plot_bgcolor='#00FFFFFF',
             paper_bgcolor='#00FFFFFF')
  })
  
  # Use a separate observer to recreate the legend as needed.
  observe({
    proxy <- leafletProxy("map")
    # Remove any existing legend, and only if the legend is
    # enabled, create a new one.
    proxy %>% clearControls()
    if (input$legend) {
      proxy %>% addLegend(position = "bottomleft",pal=pal,values = stations$line)
    }
  })

  ## 出站流量统计
  # Reactive expression for the data subsetted to what the user selected
  filteredData_out <- reactive({
    if(input$select_line_out=="All"){
      shmetro_out %>%
        filter(M5==ceiling(period_to_seconds(hms(format(input$slider_time_out,"%H:%M:%S")))/300))
    }else{
      shmetro_in %>%
        filter(M5==ceiling(period_to_seconds(hms(format(input$slider_time_out,"%H:%M:%S")))/300),line==as.numeric(input$select_line_out))
    }
  })
  
  stations_out_top5 <- reactive({
    filteredData_out() %>%
      group_by(station) %>%
      summarise(count=sum(count),line=min(line)) %>%
      arrange(desc(count)) %>%
      head(5) %>%
      as.data.frame()
  })

  ## time
  output$output_slider_time_out  <- renderText({
    paste0("Time: ", format(input$slider_time_out,"%H:%M:%S"))
  })
  
  output$map_out <- renderLeaflet({
    Shanghai %>%
      addCircles(stations$gps_lon, stations$gps_lat,color = pal(stations$line), radius=1,popup = paste(stations$station,stations$lines),fillOpacity = 1,stroke = FALSE) %>%
      clearMarkerClusters() %>%
      clearMarkers()
  })

  observe({
    data_out_circle <- data.table(filteredData_out())[, count := sum(count), by=list(station, M5)] %>%
      arrange(count)
    
    leafletProxy("map_out", data = filteredData_out()) %>%
      clearMarkerClusters() %>%
      clearMarkers() %>%
      addCircleMarkers(data_out_circle$gps_lon, data_out_circle$gps_lat, color = pal(data_out_circle$line),fillOpacity = 0.5,stroke = FALSE,  popup=paste(data_out_circle$station,data_out_circle$line,data_out_circle$count,sep=","), radius=(data_out_circle$count)^(1/2.5))
  })

  # top5
  output$out_top5 <- renderPlotly({
    # If no stations_in_top5 are in view, don't plot
    if (nrow(stations_out_top5()) == 0)
      return(NULL)

    plot_ly(stations_out_top5(),
            x = stations_out_top5()$station,
            y = stations_out_top5()$count,
            type = "bar",
            marker = list(color = pal(stations_out_top5()$line)),
            bgcolor = "#00FFFFFF") %>%
      layout(showlegend=FALSE,
             yaxis=yax,xaxis=xax,plot_bgcolor='#00FFFFFF',
             paper_bgcolor='#00FFFFFF')
  })
  
  # Use a separate observer to recreate the legend as needed.
  observe({
    proxy <- leafletProxy("map_out")
    # Remove any existing legend, and only if the legend is
    # enabled, create a new one.
    proxy %>% clearControls()
    if (input$legend_out) {
      proxy %>% addLegend(position = "bottomleft",pal=pal,values = stations$line)
    }
  })
  
  ## 线路关联
  output$line_chord <- renderChorddiag({
    chorddiag(metro_chord, groupColors = groupColors, showTicks=F, groupnamePadding = 5)
  })
})

2.5 运行shinyApp

shinyApp(ui = ui,server = server)

进站流量

出站流量

进出地铁

Avatar
Jeevan Yue
Data Analyst

Related