2017-10-14 112 views
1

輸入數據請見下方。您可以直接向下滾動至「目的」部分的問題說明;如果您以前遇到過這類問題,也許並不需要這些數據。標題:在 R 中使用 ggplot 爲時間序列的預測區間添加陰影

dput(ridership.ts) 
structure(c(1709L, 1621L, 1973L, 1812L, 1975L, 1862L, 1940L, 
2013L, 1596L, 1725L, 1676L, 1814L, 1615L, 1557L, 1891L, 1956L, 
1885L, 1623L, 1903L, 1997L, 1704L, 1810L, 1862L, 1875L, 1705L, 
1619L, 1837L, 1957L, 1917L, 1882L, 1933L, 1996L, 1673L, 1753L, 
1720L, 1734L, 1563L, 1574L, 1903L, 1834L, 1831L, 1776L, 1868L, 
1907L, 1686L, 1779L, 1776L, 1783L, 1548L, 1497L, 1798L, 1733L, 
1772L, 1761L, 1792L, 1875L, 1571L, 1647L, 1673L, 1657L, 1382L, 
1361L, 1559L, 1608L, 1697L, 1693L, 1836L, 1943L, 1551L, 1687L, 
1576L, 1700L, 1397L, 1372L, 1708L, 1655L, 1763L, 1776L, 1934L, 
2008L, 1616L, 1774L, 1732L, 1797L, 1570L, 1413L, 1755L, 1825L, 
1843L, 1826L, 1968L, 1922L, 1670L, 1791L, 1817L, 1847L, 1599L, 
1549L, 1832L, 1840L, 1846L, 1865L, 1966L, 1949L, 1607L, 1804L, 
1850L, 1836L, 1542L, 1617L, 1920L, 1971L, 1992L, 2010L, 2054L, 
2097L, 1824L, 1977L, 1981L, 2000L, 1683L, 1663L, 2008L, 2024L, 
2047L, 2073L, 2127L, 2203L, 1708L, 1951L, 1974L, 1985L, 1760L, 
1771L, 2020L, 2048L, 2069L, 1994L, 2075L, 2027L, 1734L, 1917L, 
1858L, 1996L, 1778L, 1749L, 2066L, 2099L, 2105L, 2130L, 2223L, 
2174L, 1931L, 2121L, 2076L, 2141L, 1832L, 1838L, 2132L), .Tsp = c(1991, 
2004.16666666667, 12), class = "ts") 

創建TS對象的數據幀的

調用所需的庫

library(zoo) 
library(ggplot2) 
library(scales) 
library(plotly) 
library(ggthemes) 
library(forecast) 
library(plotly) 
library(DescTools) 

創建 ts 對象的數據幀,供 ggplot 使用

# Flatten the monthly ts object into a data frame: one row per observation,
# with the observation date in `time` and the ridership count in `value`.
tsd <- data.frame(
  time = as.Date(ridership.ts),
  value = as.matrix(ridership.ts)
)

構建線性模型

# Fit a quadratic trend (trend + trend^2) to the full series.
# This object is reassigned further down with a fit on the training window only.
ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2)) 

添加新的列現有數據幀TSD

# Attach the full-series fitted values to tsd as a new column.
tsd$linear_fit <- as.matrix(ridership.lm$fitted.values)

定義驗證和訓練週期的長度

# Hold out the last 36 observations for validation; the rest is training data.
nValid <- 36
nTrain <- length(ridership.ts) - nValid

訓練數據

# Training window: periods 1 .. nTrain, counted from the first period of 1991.
train.ts <- window(
  ridership.ts,
  start = c(1991, 1),
  end = c(1991, nTrain)
)

驗證數據

# Validation window: the nValid periods that immediately follow the training
# window (periods nTrain + 1 .. nTrain + nValid, counted from 1991).
valid.ts <- window(
  ridership.ts,
  start = c(1991, nTrain + 1),
  end = c(1991, nTrain + nValid)
)

構建模型

# Refit the quadratic trend model on the training window only
# (replaces the earlier full-series fit stored under the same name).
ridership.lm <- tslm(train.ts ~ trend + I(trend^2))

使用構建好的模型進行預測

# Forecast nValid steps ahead from the training fit (level = 0 as in the
# original call); the point forecasts are read from $mean further down.
ridership.lm.pred <- forecast(ridership.lm, h = nValid, level = 0)

創建包含模型擬合值的數據幀

# Training-period dates paired with the model's fitted values.
tsd_train_model <- data.frame(
  time = as.Date(train.ts),
  lm_fit_train = as.matrix(ridership.lm$fitted.values)
)

創建用於繪圖的數據幀

# Validation-period dates paired with the point forecasts.
forecast_df <- data.frame(
  time = as.Date(valid.ts),
  value = as.matrix(ridership.lm.pred$mean)
)

使用ggplot

# Base plot: observed ridership (blue) plus the training-period fit (red).
# ylim() is kept exactly as posted: it sets the y scale limits to 1300-2300,
# which matters for every layer added to this plot later on.
p1 <- ggplot(tsd, aes(x = time, y = value)) +
  geom_line(color = "blue") +
  ylim(1300, 2300) +
  geom_line(
    data = tsd_train_model,
    mapping = aes(x = time, y = lm_fit_train),
    color = "red"
  )

# Add the dotted forecast line, yearly date breaks on the x axis, and vertical
# markers at the end of the training and validation periods.
p2 <- p1 +
  geom_line(
    data = forecast_df,
    mapping = aes(x = time, y = value),
    col = "red",
    linetype = "dotted"
  ) +
  scale_x_date(
    breaks = date_breaks("1 years"),
    labels = date_format("%b-%y")
  ) +
  geom_vline(
    xintercept = as.numeric(c(
      tsd_train_model$time[nrow(tsd_train_model)], # last date of training period
      forecast_df$time[nrow(forecast_df)] # last date of testing period
    ))
  )

# Label each period with text placed at roughly the midpoint of its date range.
# A fractional index (odd row count / 2) is truncated by R's subsetting.
p3 <- p2 +
  annotate(
    "text",
    x = c(
      tsd_train_model$time[nrow(tsd_train_model) / 2],
      forecast_df$time[nrow(forecast_df) / 2]
    ),
    y = 2250,
    label = c("Training Period", "Validation Period")
  )

enter image description here

繪圖目的:我想在預測線(圖中的紅色虛線)兩側加上預測誤差的第 5 和第 95 百分位數,並將兩者之間的區域填充爲陰影。

我使用分位數來生成預測區間的範圍

# 5th and 95th percentiles of the residuals, stored as plain unnamed numbers.
q <- quantile(ridership.lm.pred$residuals, probs = c(0.05, 0.95))

percentile_5 <- q[[1]]
percentile_95 <- q[[2]]

添加5百分位數和95百分位數的預測數據

# Lower and upper ribbon bounds: point forecast shifted by the residual
# percentiles.
yl <- forecast_df$value + percentile_5
ym <- forecast_df$value + percentile_95

問題:如果我使用下面的命令,然後它不顯示整個驗證週期的陰影區域。

# Reproduces the reported problem: p3 still carries ylim(1300, 2300) from p1,
# so rows whose ymax exceeds 2300 are dropped and the ribbon stops early.
p3 + geom_ribbon(data = forecast_df, 
       aes(ymin = yl, 
        ymax = ym), 
       fill="gray30") 

enter image description here

NROW(yl) 
[1]36 

sum(is.na(yl)) 
[1] 0 

NROW(ym) 
[1] 36 

sum(is.na(ym)) 
[1] 0 

已嘗試的方法:如果把 ymin 和 ymax 換成其他任意值,例如使用下面的命令,就會得到命令下方所示的圖。

# Second attempt, with constant bounds that lie inside the y limits.
# Note that fill = "gray30" sits inside aes() here, so ggplot2 treats it as a
# mapped value (with its own legend) rather than as a literal colour.
p3 + geom_ribbon(data = forecast_df, 
       aes(ymin = rep(1750,36), 
        ymax = rep(2000,36), 
        fill="gray30")) 

enter image description here

我的問題:

誰能告訴我在圖2的輸出背後的原因是什麼?爲什麼R會給出如圖2所示的奇怪輸出?

有誰能幫我用 ggplot 爲整個區域加上陰影嗎?

+1

請指出您在代碼中使用的所有軟件包。我不認爲'tslm'是基本軟件包的一部分。與'ts'對象轉換爲'Date'對象相同。這將幫助其他人重現您的問題以進行故障排除。 –

回答

2

TLDR:從您的ggplot代碼中刪除ylim(1300, 2300) +行。

當你使用 scale_x_***()/scale_y_***()(或等價的 xlim()/ylim())設置繪圖的範圍時,繪圖會丟棄所有落在該範圍之外的數據點。geom_ribbon 同時需要 ymin 和 ymax 兩個值;當某些 ymax 值因爲大於 2300 而被刪除時,僅憑 ymin 無法繪製色帶(ribbon),因此色帶在此之前就停止了。

如果您真的只想繪製範圍(1300,2300),請在coord_cartesian()內設置限制。這使得繪圖可以放大到範圍限制,而不會丟棄外部的數據點。有關更多信息,請參閱documentation

其他非必要的建議如下:

使用 ggplot 繪圖時,我通常儘量把所有內容放在同一個數據幀中,並在美學映射(aesthetic mapping)中使用公共變量。我的做法如下:

一切組合到一個單獨的數據幀:

library(dplyr)

# Stack the training fit and the validation forecast into one table with a
# shared `fit` column and a `status` flag, then attach it to the observed
# values. The join key is stated explicitly: `time` is the only column the
# two tables share, and naming it avoids relying on an implicit natural join.
df <- left_join(
  tsd %>% select(time, value),
  rbind(
    tsd_train_model %>%
      rename(fit = lm_fit_train) %>%
      mutate(status = "train"),
    forecast_df %>%
      rename(fit = value) %>%
      mutate(status = "valid")
  ),
  by = "time"
)

# Interval bounds exist only for the validation rows; training rows get NA so
# that no ribbon is drawn over them.
df <- df %>%
  mutate(
    yl = ifelse(status == "valid", fit + percentile_5, NA),
    ym = ifelse(status == "valid", fit + percentile_95, NA)
  )

> head(df) 
     time value  fit status yl ym 
1 1991-01-01 1709 1882.681 train NA NA 
2 1991-02-01 1621 1876.546 train NA NA 
3 1991-03-01 1973 1870.518 train NA NA 
4 1991-04-01 1812 1864.597 train NA NA 
5 1991-05-01 1975 1858.784 train NA NA 
6 1991-06-01 1862 1853.078 train NA NA 

> tail(df) 
      time value  fit status  yl  ym 
154 2003-10-01 2121 2190.490 valid 1934.914 2397.875 
155 2003-11-01 2076 2200.756 valid 1945.179 2408.141 
156 2003-12-01 2141 2211.129 valid 1955.553 2418.514 
157 2004-01-01 1832 2221.609 valid 1966.033 2428.994 
158 2004-02-01 1838 2232.197 valid 1976.620 2439.582 
159 2004-03-01 2132 2242.891 valid 1987.315 2450.277 

繪圖

# Single-data-frame version of the plot: ribbon, observed series, fitted
# values and validation-period markers are all drawn from `df`.
ggplot(data = df, aes(x = time)) +

  # place the ribbon below all other geoms for easier viewing, & increase
  # transparency
  geom_ribbon(aes(ymin = yl, ymax = ym), fill = "gray30", alpha = 0.2) +

  # original values
  geom_line(aes(y = value), color = "blue") +

  # fitted values (line type differs by training/validation)
  geom_line(aes(y = fit, linetype = status), color = "red") +

  # indicates validation range (first and last validation date)
  geom_vline(xintercept = range(df$time[df$status == "valid"])) +

  scale_x_date(
    breaks = scales::date_breaks("1 year"),
    labels = scales::date_format("%b-%y")
  ) +

  # hide legend for line type (comment this line out if you want to show it);
  # "none" replaces the original `F`, which is reassignable and whose logical
  # form is deprecated for the `guide` argument
  scale_linetype(guide = "none") +

  # limits can be tweaked here; coord_cartesian() zooms without dropping data
  coord_cartesian(ylim = c(1300, 2500)) +

  # plain white plot background for easier viewing
  theme_classic()

plot

編輯:另一種解決方案,便於添加圖例:

# Build a long data frame: observed, training-fit and validation-forecast
# values all share the `value` column and are told apart by `status`.
# Interval bounds (yl, ym) are filled in for the validation rows only.
df2 <- rbind(
  tsd %>%
    select(time, value) %>%
    mutate(status = "original"),
  tsd_train_model %>%
    rename(value = lm_fit_train) %>%
    mutate(status = "train"),
  forecast_df %>%
    mutate(status = "valid")
) %>%
  mutate(
    yl = ifelse(status == "valid", value + percentile_5, NA),
    ym = ifelse(status == "valid", value + percentile_95, NA)
  )

# in the scales for colour/line type, define the same labels in order to
# combine the two legends
ggplot(data = df2, aes(x = time)) +
  # ribbon is drawn from the rows that actually have interval bounds
  geom_ribbon(
    data = subset(df2, !is.na(yl)),
    aes(ymin = yl, ymax = ym, fill = "interval"),
    alpha = 0.2
  ) +
  geom_line(aes(y = value, color = status, linetype = status)) +
  # first and last validation date; the mask must come from df2 itself
  # (the original indexed df2$time with df$status, a different-length vector)
  geom_vline(xintercept = range(df2$time[df2$status == "valid"])) +
  scale_x_date(
    breaks = scales::date_breaks("1 year"),
    labels = scales::date_format("%b-%y")
  ) +
  scale_color_manual(
    name = "",
    values = c(
      "original" = "blue",
      "train" = "red",
      "valid" = "red"
    )
  ) +
  scale_linetype_manual(
    name = "",
    values = c(
      "original" = "solid",
      "train" = "solid",
      "valid" = "longdash"
    )
  ) +
  scale_fill_manual(
    name = "",
    values = c("interval" = "gray30")
  ) +
  # zoom without discarding data outside the visible range
  coord_cartesian(ylim = c(1300, 2500)) +
  theme_classic() +
  theme(legend.position = "bottom")

plot2

+0

你能告訴我如何自動和手動地將圖例添加到此圖表嗎? – user110244

+0

你想要哪一種圖例?顏色(原始觀測值爲藍色,擬合值爲紅色)/線型(訓練期爲實線,驗證期爲虛線)/誤差範圍? –

+0

是的。你是對的。你能告訴我如何添加這些?我試過scale_color_manual,但這似乎並不奏效。請指導 – user110244