2017-06-03 67 views
2

我需要計算這些值的平均值,其中 class(newtemp) 是 character。數據原本形如 -10.6 °C。我刪除了 °C,但空格仍然留在那裏,因此 as.numeric 不起作用。我無法從數據中刪除這個空白,所以無法計算平均值。

newtemp 

[1] "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " 

[12] "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-11.0 " 

[23] "-10.6 " "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-9.4 " 

[34] "-8.9 " "-8.3 " "-7.2 " "-7.2 " "-5.6 " "-5.0 " "-3.9 " "-3.9 " "-3.9 " "-3.3 " "-3.3 " 

[45] "-3.9 " "-6.1 " "-8.3 " "-7.8 " "-8.9 " "-10.0 " "-11.7 " "-12.8 " 


# Attempt 1: remove literal spaces with stringr.
# Result: the blank is NOT removed.
library(stringr)
stripped <- str_replace_all(newtemp, fixed(" "), "")
# Attempt 2: trim leading/trailing whitespace with a small regex helper.
trim <- function(x) gsub("^\\s+|\\s+$", "", x)
trim(x = newtemp)
# Result: the blank is still not removed.

# Conversion of the first attempt's result (named `stripped` rather than
# `try` so that base::try is not masked).
as.numeric(stripped)

[1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 

[36] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 
#Warning message: 
#NAs introduced by coercion 
#Sill no output. 

讀取數據所使用的代碼是:

library(rvest)

# Page holding the daily observation history to scrape.
linkurl <- "https://www.wunderground.com/history/airport/KVAY/2015/2/17/DailyHistory.html?req_city=Cherry+Hill&req_state=NJ&req_statename=New+Jersey&reqdb.zip=08002&reqdb.magic=1&reqdb.wmo=99999&MR=1"

# Parse the page and pull the #obsTable node out as a data frame.
weathertable <- linkurl %>%
  read_html() %>%
  html_node("#obsTable") %>%
  html_table()
weathertable

# Temperature column as scraped (character values such as "-10.6 °C").
newtemp <- weathertable$Temp.

# Remove the unit, then try two ways of dropping the leftover blank
# before converting to numeric.
abc <- gsub("°C", "", newtemp)
abc
abc_new <- gsub("[[:space:]]", "", abc)
as.numeric(abc_new)
trimws(x = abc)
as.numeric(trimws(x = abc))

更新

 > newtemp 
    [1] "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" 
    [10] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" 
    [19] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" 
    [28] "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-9.4 °C" "-8.9 °C" "-8.3 °C" "-7.2 °C" 
    [37] "-7.2 °C" "-5.6 °C" "-5.0 °C" "-3.9 °C" "-3.9 °C" "-3.9 °C" "-3.3 °C" "-3.3 °C" "-3.9 °C" 
    [46] "-6.1 °C" "-8.3 °C" "-7.8 °C" "-8.9 °C" "-10.0 °C" "-11.7 °C" "-12.8 °C" 
    > abc=(gsub(" °C", "", newtemp)) 
    > abc 
    [1] "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" 
    [10] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" 
    [19] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" 
    [28] "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-9.4 °C" "-8.9 °C" "-8.3 °C" "-7.2 °C" 
    [37] "-7.2 °C" "-5.6 °C" "-5.0 °C" "-3.9 °C" "-3.9 °C" "-3.9 °C" "-3.3 °C" "-3.3 °C" "-3.9 °C" 
    [46] "-6.1 °C" "-8.3 °C" "-7.8 °C" "-8.9 °C" "-10.0 °C" "-11.7 °C" "-12.8 °C" 
    > abc=(gsub("°C", "", newtemp)) 
> abc 
[1] "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " 
[12] "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-11.0 " 
[23] "-10.6 " "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-9.4 " 
[34] "-8.9 " "-8.3 " "-7.2 " "-7.2 " "-5.6 " "-5.0 " "-3.9 " "-3.9 " "-3.9 " "-3.3 " "-3.3 " 
[45] "-3.9 " "-6.1 " "-8.3 " "-7.8 " "-8.9 " "-10.0 " "-11.7 " "-12.8 " 
> 
+0

您能否貼出 lapply(newtemp[1:5], charToRaw) 的輸出,以便弄清楚這個「空白」的字符編碼? –

+0

請展示你是如何去除 °C 的。 –

+0

@RYoda您的命令給輸出如下:[[1]] [1] 2D 31 30 2E 36 C2 A0 [[2]] [1] 2D 31 30 2E 36 C2 A0 [[3] ] [1] 2D 31 31 2E 30 C2 A0 [[4]] [1] 2D 31 30 2E 36 C2 A0 [[5]] [1] 2D 31 30 2E 36 C2 A0 –

回答

1

您的字符串中包含的空白字符並不是常規的 ASCII 空格(十進制值 32)。因此,您需要一個能匹配任何 Unicode 空白字符的正則表達式。奇怪的是,簡單的 gsub("[[:space:]]*°C", "", newtemp) 並非在所有 R 環境中都有效。

通常有效的做法是使用 PCRE 正則表達式:

# (*UCP) makes \s Unicode-aware, so it also matches the no-break space
# (bytes C2 A0 in UTF-8) that sits between the number and °C.
gsub("(*UCP)\\s*°C", "", newtemp, perl=TRUE) 

這裏,(*UCP) 是一個 PCRE 動詞,它使簡寫字符類(如 \s)支持 Unicode,因此 \s 可以匹配任何 Unicode 空白字符。參數 perl=TRUE 使 R 使用 PCRE 正則表達式引擎,而不是默認的 TRE 正則表達式引擎。

+0

很棒的解決方案!給其他讀者補充:c2 a0 是不換行空格(non-breaking space)在 UTF-8 編碼下的字節表示(請參見 http://www.utf8-zeichentabelle.de/) –

2

可以使用函數 trimws:

> x <- "-10.6 " 
> trimws(x) 
[1] "-10.6" 
> as.numeric(trimws(x)) 
[1] -10.6 

UPDATE

這似乎適用於你的情況。

# Keep only the leading number (optional minus sign, digits, optional
# decimal part) and drop whatever follows it, including the unit.
# The minus sign and the fraction are optional so that positive or
# whole-number readings are cleaned as well.
abc <- sub("(^[-]?\\d+\\.?\\d*)(.*$)", "\\1", newtemp)
data.frame(new = abc, old = newtemp)
    new  old 
1 -10.6 -10.6 °C 
2 -10.6 -10.6 °C 
3 -11.0 -11.0 °C 
4 -10.6 -10.6 °C 
5 -10.6 -10.6 °C 
6 -10.6 -10.6 °C 
7 -10.6 -10.6 °C 
8 -10.6 -10.6 °C 
9 -10.6 -10.6 °C 
10 -10.6 -10.6 °C 
11 -10.6 -10.6 °C 
12 -10.6 -10.6 °C 
13 -10.6 -10.6 °C 
14 -10.6 -10.6 °C 
... 

字符「轉換」方面發生的一些事情超出了我的理解範圍,所以我無法解釋到底哪裏出了問題。我的解決辦法是忽略 °C 這個看起來有問題的部分,只用正則表達式(regular expression)提取數字部分。簡而言之,我用特殊字符依次匹配開頭的減號([-])、數字(\d+ 匹配連續的一串數字)、小數點(\.)和字符串結尾($),用 () 把它們分組,然後只提取第一組,把其餘內容全部丟棄。

這是抓取到的數據的原始字節:

> charToRaw(newtemp[1]) 
[1] 2d 31 30 2e 36 c2 a0 c2 b0 43 

而如果我把字符串複製/粘貼到 R 中,則是:

> charToRaw("-10.6 °C") 
[1] 2d 31 30 2e 36 20 b0 43 

也許有更懂計算機的人可以補充說明這裏到底發生了什麼。

+0

空格仍然沒有被刪除……請參考上面的代碼 –

+0

這個 gsub 表達式運行正常。你能否說明它實際上做了什麼,以及我錯在哪裏? –

+0

@RakshitSakhuja 給你,已補充說明。 –

0

您可以通過使用readr::parse_number避免整個問題:

library(rvest) 
library(tidyverse) 

url <- "https://www.wunderground.com/history/airport/KVAY/2015/2/17/DailyHistory.html?req_city=Cherry+Hill&req_state=NJ&req_statename=New+Jersey&reqdb.zip=08002&reqdb.magic=1&reqdb.wmo=99999&MR=1" 

# Download the page once and keep the parsed HTML document.
page <- url %>% read_html()

# Extract the observations table (#obsTable) as a data frame.
obs <- page %>%
  html_node("#obsTable") %>%
  html_table()

# parse_number() keeps only the numeric part of each value, so the
# blank and the degree unit are dropped without any hand-written regex.
obs_clean <- obs %>%
  mutate(Temp. = parse_number(Temp.)) %>%
  as_tibble() # for printing; replaces the deprecated tbl_df()

obs_clean 
#> # A tibble: 52 x 13 
#> `Time (EST)` Temp. Windchill `Dew Point` Humidity Pressure Visibility 
#>   <chr> <dbl>  <chr>  <chr> <chr> <chr>  <chr> 
#> 1  12:33 AM 12.9   -  8.1 °F  81% 30.08 in  2.0 mi 
#> 2  12:45 AM 12.9   -  8.1 °F  81% 30.05 in  1.2 mi 
#> 3  12:51 AM 12.2   -  8.6 °F  85% 30.05 in  0.8 mi 
#> 4  12:54 AM 12.9   -  9.0 °F  84% 30.06 in  0.8 mi 
#> 5  1:02 AM 12.9   -  9.0 °F  84% 30.05 in  0.5 mi 
#> 6  1:25 AM 12.9   -  9.0 °F  84% 30.02 in  1.0 mi 
#> 7  1:37 AM 12.9 6.9 °F  9.0 °F  84% 30.03 in  0.8 mi 
#> 8  1:54 AM 12.9 6.9 °F  9.0 °F  84% 30.02 in  0.8 mi 
#> 9  2:18 AM 12.9 5.2 °F  9.0 °F  84% 30.01 in  1.0 mi 
#> 10  2:40 AM 12.9 5.2 °F  9.0 °F  84% 29.99 in  1.0 mi 
#> # ... with 42 more rows, and 6 more variables: `Wind Dir` <chr>, `Wind 
#> # Speed` <chr>, `Gust Speed` <chr>, Precip <chr>, Events <chr>, 
#> # Conditions <chr> 

或正則表達式,

# Keep only digits, the decimal point and the minus sign; everything else
# (unit letters, degree sign, regular or no-break space) is removed before
# the numeric conversion. An explicit whitelist avoids relying on how the
# regex engine treats \W inside a bracket expression, which could also
# strip "-" and ".".
obs %>%
  mutate(Temp. = as.numeric(gsub("[^0-9.-]", "", Temp.))) %>%
  as_tibble() # replaces the deprecated tbl_df()

會返回相同的結果。