2017-09-22 57 views
1

我有一些樹的結構模型(如樹木與木材,而不是統計樹木)。這些模型可以用簡單的數據框表示,每行表示一個分支段,並且包含關於該段的長度和父行的信息。例如爲:如何高效地計算樹形拓撲中的路徑長度?

len parent_row 
.3 0 
.4 1 
.2 2 
.5 1 
... 

我需要計算的路徑長度的至少每一個尖端,但可能很容易地在每個「節點」或分支端。

我想知道是否有一個更有效的方式做到這一點比循環通過段?下面的例子足夠快地運行,但是當運行跨越數千個分段和數百棵樹時,需要數小時/天。我嘗試遞歸,雖然優雅,但速度非常緩慢。更好的解決方案比我的下面非常感謝!

感謝, 艾莉

calc_pathlen <- function(tree_structure) { 
    tree_structure[1,]$path_len = tree_structure[1,]$len 
    for (this_row in 2:nrow(tree_structure)) { 
    prev_len = tree_structure[ tree_structure[this_row,]$parent_row, ]$path_len 
    tree_structure[this_row,]$path_len = tree_structure[this_row,]$len + prev_len 
    } 
    return(tree_structure) 
} 

treestr = 
structure(list(len = c(2.3083, 1.632, 1.5825, 1.5592, 1.4847, 
1.367, 1.3112, 1.3211, 1.4324, 1.295, 1.1503, 0.989, 0.8525, 
0.9726, 0.8982, 0.8836, 0.8605, 0.7188, 0.5755, 0.5589, 0.6966, 
0.6531, 0.6741, 0.7582, 0.817, 0.7074, 0.7248, 0.5725, 0.6796, 
0.405, 0.5567, 0.6619, 0.473, 0.3485, 0.3946, 0.3593, 0.6572, 
0.3869, 0.4623, 3.4597, 0.5878, 0.7538, 0.3745, 2.3949, 0.3239, 
2.2521, 0.7563, 6.6197, 1.026, 0.9864, 0.6527, 0.6708, 0.4002, 
0.5858, 1.3728, 0.4469, 0.3309, 0.7075, 0.7637, 0.8429, 0.9874, 
0.1311, 0.3487, 1.037, 0.9087, 0.8871, 1.0236, 0.4481, 0.5124, 
0.4371, 0.4766, 0.4553, 0.5172, 0.4781, 0.5084, 0.4263, 0.3542, 
0.3062, 0.2901, 0.2042, 0.3126, 0.2366, 0.2714, 0.2741, 0.3511, 
0.1613, 0.3028, 0.1689, 0.0417, 0.2132, 0.1269, 0.2181, 0.1868, 
0.1553, 0.1151, 0.1602, 0.1786, 0.1649, 0.1967, 0.1584, 0.1366, 
0.13, 0.1722, 0.1725, 0.111, 0.1752, 0.1306, 0.1582, 0.0942, 
0.041, 0.356, 0.1958, 0.1212, 0.4129, 0.3499, 0.1706, 0.4647, 
0.272, 0.4036, 0.0949, 0.1161, 0.1271, 0.2153, 0.1685, 0.181, 
0.1302, 0.1229, 0.1146, 0.1393, 0.1722, 0.2033, 0.099, 0.1988, 
0.1264, 0.1377, 0.1634, 0.2223, 0.1466, 0.1376, 0.1788, 0.1946, 
0.1146, 0.2983, 0.1176, 0.0951, 0.1279, 0.095, 0.1909, 0.162, 
0.1562, 0.1225, 0.1471, 0.1976, 0.2194, 0.1256, 0.2105, 0.1147, 
0.1855, 0.1011, 0.1735, 0.1254, 0.1793, 0.8308, 0.2481, 0.2479, 
0.1788, 0.2907, 0.2904, 0.3411, 0.224, 0.2298, 0.1942, 0.1872, 
0.3311, 0.2295, 0.1466, 4.4547, 0.553, 0.2382, 0.2559, 0.2991, 
0.3763, 0.117, 0.2036, 0.1795, 0.1933, 0.1765, 0.3909, 0.4619, 
2.5901, 0.2655, 0.5306, 0.6369, 0.478, 2.5033, 0.4954, 0.2455, 
0.3417, 0.0605, 0.4264, 0.4569, 0.3266, 0.0666, 0.2642, 0.3017, 
0.0154, 0.3844, 0.2846, 0.2823, 0.3533, 0.1624, 0.1551, 0.1903, 
0.169, 0.1362, 0.1714, 0.1899, 0.1539, 0.1477, 0.1815, 0.2597, 
0.1302, 0.225, 0.3408, 0.2535, 0.521, 0.2328, 0.2115, 0.2387, 
0.1519, 0.1885, 0.1678, 0.1398, 0.3438, 0.458, 0.197, 0.2821, 
0.3694, 0.2039, 0.1518, 0.2985, 1.2251, 0.3328, 0.1593, 0.2945, 
0.1412, 0.2308, 0.2833, 2.6558, 0.3388, 0.3979, 0.2965, 0.2548, 
3.4452, 0.1267, 0.1801, 0.1494, 0.4831, 2.552, 0.1391, 0.3995, 
0.2753, 0.4019, 0.39, 0.7409, 0.5362, 0.7483, 1.6991, 0.2951, 
0.389, 0.7077, 0.7083, 0.6922, 0.5679, 0.2076, 0.2729, 0.3376, 
0.2368, 0.1639, 0.1788, 0.1885, 0.2328, 0.2057, 0.2091, 0.2673, 
0.1848, 0.1389, 0.1879, 0.1537, 0.3528, 0.2515, 0.2249, 0.259, 
0.1838, 0.2231, 0.228, 0.1753, 0.267, 0.2278, 0.257), parent_row = c(0L, 
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 
28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 
41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 
54L, 55L, 56L, 57L, 58L, 59L, 60L, 8L, 11L, 63L, 64L, 65L, 66L, 
67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 
80L, 81L, 82L, 83L, 84L, 17L, 86L, 87L, 88L, 89L, 90L, 91L, 92L, 
93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 104L, 
105L, 106L, 107L, 108L, 109L, 110L, 111L, 112L, 113L, 114L, 115L, 
116L, 117L, 118L, 119L, 120L, 121L, 122L, 123L, 124L, 125L, 126L, 
127L, 128L, 129L, 130L, 131L, 132L, 133L, 134L, 135L, 136L, 137L, 
138L, 139L, 140L, 141L, 142L, 143L, 144L, 145L, 146L, 147L, 148L, 
149L, 150L, 151L, 152L, 153L, 154L, 155L, 156L, 157L, 158L, 159L, 
160L, 161L, 162L, 163L, 164L, 165L, 166L, 167L, 168L, 169L, 170L, 
171L, 172L, 173L, 174L, 175L, 176L, 177L, 178L, 179L, 180L, 181L, 
182L, 183L, 184L, 185L, 186L, 187L, 188L, 189L, 190L, 191L, 192L, 
193L, 194L, 195L, 13L, 197L, 13L, 199L, 16L, 201L, 17L, 203L, 
204L, 19L, 206L, 207L, 22L, 209L, 210L, 211L, 212L, 213L, 214L, 
215L, 216L, 217L, 218L, 219L, 220L, 221L, 222L, 223L, 224L, 225L, 
226L, 227L, 228L, 229L, 230L, 231L, 232L, 233L, 234L, 235L, 236L, 
237L, 238L, 239L, 240L, 241L, 242L, 243L, 244L, 245L, 246L, 247L, 
248L, 249L, 250L, 251L, 252L, 253L, 254L, 255L, 256L, 257L, 258L, 
259L, 260L, 261L, 262L, 263L, 264L, 265L, 266L, 267L, 268L, 269L, 
270L, 271L, 272L, 273L, 22L, 275L, 276L, 277L, 278L, 279L, 280L, 
281L, 282L, 283L, 284L, 22L, 286L, 287L, 23L, 289L, 290L, 291L, 
292L, 293L, 294L, 295L, 296L, 297L, 298L, 299L), path_len = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("len", 
"parent_row", "path_len"), row.names = c(NA, 300L), class = "data.frame") 

calc_pathlen(treestr) 

回答

0

它是Rcpp理想的情況。

Rcpp::cppFunction(' 

    NumericVector calcTreeLength(NumericVector len, NumericVector idx){ 
    int n = len.size(); 
    NumericVector res(n); 
    double cumsum=0; 
    int j; 
    res[0] = len[0]; 

    for(int i = 1; i < n; i++){ 
     j = idx[ i ] - 1; 
     cumsum = res[ j ] + len[ i ]; 
     res[ i ] = cumsum; 
    } 
    return res; 
}') 

運行兩個函數比較結果

treestr <- calc_pathlen(treestr) 
treestr$path_len_cpp <- calcTreeLength(treestr$len, treestr$parent_row) 

sum(treestr$path_len_cpp!=treestr$path_len) 
#[1] 0 

速度相差約〜3000倍

microbenchmark::microbenchmark(
    calc_pathlen(treestr), 
    calcTreeLength(treestr$len, treestr$parent_row) 
) 

enter image description here

享受!