2016-09-25 55 views
0
library(XML) 
# Parse the TCGA clinical XML file and flatten every patient node into one
# data-frame row.
file <- "E:/aaa.xml"
doc <- xmlInternalTreeParse(file)
# Prefix of the root element's namespace ("brca" in the sample document).
ns <- names(xmlNamespace(xmlRoot(doc)))
# Select each <ns:patient> element directly below the <ns:tcga_bcr> root.
patient <- getNodeSet(doc, path = paste0("/", ns, ":tcga_bcr/", ns, ":patient"))
# NOTE: `row` shadows base::row(); the name is kept so existing code that
# refers to it keeps working.
row <- xmlToDataFrame(nodes = patient, stringsAsFactors = FALSE)

shared_stage:stage_event 有許多子節點,如何把每個子節點準確地轉換成 data.frame 中的一列?也就是說,如何在 R 中讀取 XML 文件並將其轉換爲 data.frame?

如果節點具有preferred_name,請使用preferred_name作爲data.frame列名稱。

aaa.xml:

<?xml version="1.0" encoding="UTF-8"?> 
<brca:tcga_bcr xsi:schemaLocation="http://tcga.nci/bcr/xml/clinical/brca/2.7 http://tcga-data.nci.nih.gov/docs/xsd/BCR/tcga.nci/bcr/xml/clinical/brca/2.7/TCGA_BCR.BRCA_Clinical.xsd" schemaVersion="2.7" xmlns:brca="http://tcga.nci/bcr/xml/clinical/brca/2.7" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:admin="http://tcga.nci/bcr/xml/administration/2.7" xmlns:clin_shared="http://tcga.nci/bcr/xml/clinical/shared/2.7" xmlns:shared="http://tcga.nci/bcr/xml/shared/2.7" xmlns:brca_shared="http://tcga.nci/bcr/xml/clinical/brca/shared/2.7" xmlns:shared_stage="http://tcga.nci/bcr/xml/clinical/shared/stage/2.7" xmlns:brca_nte="http://tcga.nci/bcr/xml/clinical/brca/shared/new_tumor_event/2.7/1.0" xmlns:nte="http://tcga.nci/bcr/xml/clinical/shared/new_tumor_event/2.7" xmlns:follow_up_v2.1="http://tcga.nci/bcr/xml/clinical/brca/followup/2.7/2.1" xmlns:rx="http://tcga.nci/bcr/xml/clinical/pharmaceutical/2.7" xmlns:rad="http://tcga.nci/bcr/xml/clinical/radiation/2.7"> 
<brca:patient> 
    <admin:additional_studies/> 
    <clin_shared:tumor_tissue_site preferred_name="submitted_tumor_site" display_order="9999" cde="3427536" cde_ver="2.000" xsd_ver="2.6" tier="2" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="175314">Breast</clin_shared:tumor_tissue_site> 
    <clin_shared:race_list> 
     <clin_shared:race preferred_name="race" display_order="12" cde="2192199" cde_ver="1.000" xsd_ver="1.8" tier="2" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="175301">WHITE</clin_shared:race> 
    </clin_shared:race_list> 
    <shared:bcr_patient_barcode preferred_name="" display_order="9999" cde="2673794" cde_ver="" xsd_ver="1.8" owner="TSS" procurement_status="Completed" restricted="false">TCGA-A2-A0EV</shared:bcr_patient_barcode> 
    <shared:tissue_source_site cde="" cde_ver="" xsd_ver="2.4" owner="TSS" procurement_status="Completed" restricted="false">A2</shared:tissue_source_site> 
    <shared_stage:stage_event system="AJCC"> 
     <shared_stage:system_version preferred_name="ajcc_staging_edition" display_order="51" cde="2722309" cde_ver="1.000" xsd_ver="2.6" tier="1" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="1080001">6th</shared_stage:system_version> 
     <shared_stage:tnm_categories> 
      <shared_stage:pathologic_categories> 
       <shared_stage:pathologic_T preferred_name="ajcc_tumor_pathologic_pt" display_order="52" cde="3045435" cde_ver="1.000" xsd_ver="2.6" tier="1" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="175336">T1c</shared_stage:pathologic_T> 
      </shared_stage:pathologic_categories> 
     </shared_stage:tnm_categories> 
    </shared_stage:stage_event>  
    <rx:drugs/> 
    <rad:radiations/> 
</brca:patient> 
</brca:tcga_bcr> 

data.frame

submitted_tumor_site race bcr_patient_barcode ajcc_staging_edition ajcc_tumor_pathologic_pt 
Breast    WHITE TCGA-A2-A0EV   6th    T1c 

回答

1

由於您的 XML 中有嵌套的後代節點和不同的命名空間,可以考慮直接對每個需要的 XML 值運行 XPath,然後將結果綁定到一個數據框中。外層的 lapply() 會遍歷所有 brca:patient 節點,並借助 checkpath() 函數處理可能缺失的子節點或後代節點:

# Index of every brca:patient node. seq_along() yields an empty sequence
# (rather than c(1, 0)) when the document contains no patient nodes.
patientnum <- seq_along(xpathSApply(doc, "//brca:patient"))

#' Return the first XPath match, or NA when nothing matched.
#'
#' @param xpath Result of an XPath query (e.g. from `xpathSApply()`); may be
#'   empty when the queried node does not exist.
#' @return The first element of `xpath`, or `NA` if `xpath` is empty.
checkpath <- function(xpath) {
    # The condition is a single logical, so plain if/else is used instead of
    # the vectorised ifelse(); the value is also returned visibly.
    if (length(xpath) > 0) xpath[[1]] else NA
}

# XPath of each wanted value, relative to a single brca:patient node and
# keyed by the column name used in the resulting data frame.
patientpaths <- c(
    tumor_tissue_site   = "clin_shared:tumor_tissue_site",
    race                = "descendant::clin_shared:race",
    bcr_patient_barcode = "descendant::shared:bcr_patient_barcode",
    system_version      = "descendant::shared_stage:system_version",
    pathologic_T        = "descendant::shared_stage:pathologic_T"
)

# For every patient, look up each field and keep the first match (NA when the
# node is absent). vapply() guarantees one character value per field, so every
# patient yields a named character vector of the same length.
patientdata <- lapply(patientnum, function(i) {
    vapply(patientpaths, function(relpath) {
        query <- paste0("//brca:patient[", i, "]/", relpath)
        as.character(checkpath(xpathSApply(doc, query, xmlValue)))
    }, character(1))
})

# Stack the per-patient vectors into a matrix (one row per patient), then
# convert it to a data frame whose columns are the field names.
patients <- do.call(rbind, patientdata)
patients <- data.frame(patients, stringsAsFactors = FALSE)

另外,您也可以使用 xmlToDataFrame(),但需要先將 XML 扁平化並簡化,這可以用 XSLT(一種 XML 轉換語言,與 XPath 同屬一個技術家族)來完成。

雖然 R 沒有專門的通用 XSLT 庫,但您可以使用外部處理器,包括其他語言(Python、Java、PHP,甚至 Excel VBA)、專用的可執行程序(Saxon、Xalan)或命令行解釋器(PowerShell、Bash)。R 可以通過 system() 調用其中任何一個。

XSLT腳本

<!--
  Flattens a TCGA BRCA clinical document into one namespace-free <patient>
  element per brca:patient, each holding five simple child elements, so the
  result can be read directly with xmlToDataFrame().
-->
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" 
       xmlns:brca="http://tcga.nci/bcr/xml/clinical/brca/2.7" 
       xmlns:clin_shared="http://tcga.nci/bcr/xml/clinical/shared/2.7" 
       xmlns:shared="http://tcga.nci/bcr/xml/shared/2.7"    
       xmlns:shared_stage="http://tcga.nci/bcr/xml/clinical/shared/stage/2.7" 
       exclude-result-prefixes="brca clin_shared shared shared_stage"> 
<!-- exclude-result-prefixes keeps the source namespace declarations from
     being copied onto the literal result elements below. -->
<xsl:output version="1.0" encoding="UTF-8" indent="yes" /> 
<xsl:strip-space elements="*"/> 

    <!-- Root: emit <tcga_bcr> (local name only) and process each patient. -->
    <xsl:template match="/brca:tcga_bcr"> 
    <xsl:element name="{local-name()}"> 
     <xsl:apply-templates select="brca:patient"/> 
    </xsl:element> 
    </xsl:template>  

    <!-- Patient: emit <patient> with one child element per wanted value.
         In XSLT 1.0, xsl:value-of outputs the string value of the first
         selected node only; an absent node yields an empty element. -->
    <xsl:template match="brca:patient">  
    <xsl:element name="{local-name()}"> 
     <tumor_tissue_site><xsl:value-of select="clin_shared:tumor_tissue_site"/></tumor_tissue_site> 
     <race><xsl:value-of select="descendant::clin_shared:race"/></race> 
     <bcr_patient_barcode><xsl:value-of select="descendant::shared:bcr_patient_barcode"/></bcr_patient_barcode> 
     <system_version><xsl:value-of select="descendant::shared_stage:system_version"/></system_version> 
     <pathologic_T><xsl:value-of select="descendant::shared_stage:pathologic_T"/></pathologic_T> 
    </xsl:element> 
    </xsl:template> 

</xsl:transform> 

R腳本

# Run the XSLT transformation through an external processor. The command
# string below is a placeholder for the real command line.
system("command line call to transform xml source with xslt")
# Example using a Python script as the processor:
#   system('python "path/to/transformation_script.py"')

# Load the flattened document written by the transformation and display it.
doc <- xmlParse("path/to/transformed.xml")
doc
# Printed document:
# <?xml version="1.0" encoding="UTF-8"?>
# <tcga_bcr>
# <patient>
#  <tumor_tissue_site>Breast</tumor_tissue_site>
#  <race>WHITE</race>
#  <bcr_patient_barcode>TCGA-A2-A0EV</bcr_patient_barcode>
#  <system_version>6th</system_version>
#  <pathologic_T>T1c</pathologic_T>
# </patient>
# </tcga_bcr>

# Build the data frame: one row per <patient> element, one column per child.
patients <- xmlToDataFrame(
    nodes = getNodeSet(doc, "//patient"),
    stringsAsFactors = FALSE
)
0
# Parse the document and select every <ns:patient> below the <ns:tcga_bcr>
# root, where `ns` is the prefix of the root element's namespace.
doc <- xmlInternalTreeParse(file)
ns <- names(xmlNamespace(xmlRoot(doc)))
patient <- getNodeSet(doc, path = paste0("/", ns, ":tcga_bcr/", ns, ":patient"))

# Child nodes of the first patient element.
patient.fields <- xmlChildren(patient[[1]])
# Show the second child (clin_shared:tumor_tissue_site in the sample file).
patient.fields[[2]]

結果是

<clin_shared:tumor_tissue_site preferred_name="submitted_tumor_site" display_order="9999" cde="3427536" cde_ver="2.000" xsd_ver="2.6" tier="2" owner="TSS" procurement_status="Completed" restricted="false" source_system_identifier="175314">Breast</clin_shared:tumor_tissue_site> 

如何提取 patient.fields[[2]] 中 preferred_name 屬性的內容?