2016-11-29 39 views
4

基於R工作室1.0.44上的MacOS 10.12.1將XML文件轉換爲R數據幀

這是處理XML文件,我有相當多的麻煩我第一次.. 我想把我的xml文件到一個R的數據幀,看起來是這樣的:

Date househouldid memberid session begin end 
11/1/15 4FDYT14A1Q    1  235 49448 49453 
11/1/15 4FDYT14A1Q    1 1010001 49453 49463 
11/1/15 4FDYT14A1Q    1 1010475 49463 49468 

我只想要日期,householdid,MEMBERID,會議,開始和數據幀結束。

我想...

xmldataframe<-xmlToDataFrame("DQA_audxml_20150105_EN.xml") 
print(xmldataframe) 

,並得到這個...

目的地成員regular_guests occasional_guests tvsets household_process

下面是XML文件的一部分(我受字符數限制)。

<?xml version="1.0" encoding="UTF-8"?> 
<households date="2015-01-05" creation_date="2015-01-11T04:22:35" xmlns:s="http://www.mediametrie.fr/nge/schemas" xmlns:xalan="http://xml.apache.org/xalan"> 
    <household id="95D7N14AFA"> 
    <destinations /> 
    <members> 
     <member id="1"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
     <member id="2"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
     <member id="3"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
    </members> 
    <regular_guests /> 
    <occasional_guests /> 
    <tvsets> 
     <tvset id="1"> 
     <tvset_process result="OK"> 
      <tvset_audience /> 
      <alarms> 
      <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING"> 
       <parameters> 
       <parameter name="unique_id" value="149" /> 
       <parameter name="reboot_date" value="2015-01-05T03:34:23" /> 
       <parameter name="length" value="0.35116898148148146" /> 
       </parameters> 
      </alarm> 
      <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING"> 
       <parameters> 
       <parameter name="unique_id" value="150" /> 
       <parameter name="reboot_date" value="2015-01-05T05:28:34" /> 
       <parameter name="length" value="0.02806712962962963" /> 
       </parameters> 
      </alarm> 
      <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING"> 
       <parameters> 
       <parameter name="unique_id" value="151" /> 
       <parameter name="reboot_date" value="2015-01-05T08:22:46" /> 
       <parameter name="length" value="0.07322916666666666" /> 
       </parameters> 
      </alarm> 
      <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING"> 
       <parameters> 
       <parameter name="unique_id" value="152" /> 
       <parameter name="reboot_date" value="2015-01-05T14:32:43" /> 
       <parameter name="length" value="0.004803240740740742" /> 
       </parameters> 
      </alarm> 
      <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING"> 
       <parameters> 
       <parameter name="unique_id" value="153" /> 
       <parameter name="reboot_date" value="2015-01-05T18:30:47" /> 
       <parameter name="length" value="0.0" /> 
       </parameters> 
      </alarm> 
      <alarm id="AL_T_P_VALID_LAST_HOUR_REBOOT" rule_id="T_P_METER_STOPPING_TIMESTAMPING"> 
       <parameters> 
       <parameter name="unique_id" value="154" /> 
       <parameter name="reboot_date" value="2015-01-06T02:52:01" /> 
       <parameter name="length" value="0.33817129629629633" /> 
       </parameters> 
      </alarm> 
      </alarms> 
     </tvset_process> 
     </tvset> 
    </tvsets> 
    <household_process result="OK" vacation="no"> 
     <alarms /> 
    </household_process> 
    </household> 
    <household id="ISDPN14ASG"> 
    <destinations /> 
    <members> 
     <member id="1"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
     <member id="2"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
     <member id="3"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
     <member id="4"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
     <member id="5"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
    </members> 
    <regular_guests /> 
    <occasional_guests /> 
    <tvsets> 
     <tvset id="1"> 
     <tvset_process result="KO"> 
      <tvset_audience /> 
      <alarms> 
      <alarm id="AL_R_P_EVENT_CHRONOLOGY" rule_id="R_P_EVENT_CHRONOLOGY"> 
       <parameters> 
       <parameter name="unique_id" value="16195" /> 
       </parameters> 
      </alarm> 
      <alarm id="AL_PROCESS_STOP" rule_id="R_P_EVENT_CHRONOLOGY"> 
       <parameters /> 
      </alarm> 
      </alarms> 
     </tvset_process> 
     </tvset> 
    </tvsets> 
    <household_process result="KO" vacation="no"> 
     <alarms> 
     <alarm id="AL_T_FP_HOUSEHOLD_ABS_DECLARATION" rule_id="T_FP_HOUSEHOLD_ABS_DECLARATION"> 
      <parameters /> 
     </alarm> 
     <alarm id="AL_R_FP_NO_TVSET_KO" rule_id="R_FP_NO_TVSET_KO"> 
      <parameters> 
      <parameter name="tvset_id" value="1" /> 
      </parameters> 
     </alarm> 
     <alarm id="AL_R_FP_AT_LEAST_ONE_TVSET_OK" rule_id="R_FP_AT_LEAST_ONE_TVSET_OK"> 
      <parameters /> 
     </alarm> 
     </alarms> 
    </household_process> 
    </household> 
    <household id="4FDYT14A1Q"> 
    <destinations /> 
    <members> 
     <member id="1"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
     <member id="2"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience> 
      <individual_audience_tvset id="1"> 
       <channel session="1010497" begin="46722" end="46738" /> 
       <channel session="48" begin="46738" end="46743" /> 
       <channel session="235" begin="46743" end="46748" /> 
       <channel session="47" begin="46748" end="46753" /> 
       <channel session="1010383" begin="46753" end="46973" /> 
       <channel session="1010497" begin="46973" end="46983" /> 
       <channel session="1010319" begin="46983" end="46993" /> 
       <channel session="1010072" begin="46993" end="47578" /> 
       <channel session="1010499" begin="47578" end="47618" /> 
       <channel session="47" begin="47618" end="47633" /> 
       <channel session="1010383" begin="47633" end="47693" /> 
       <channel session="48" begin="47693" end="47703" /> 
       <channel session="235" begin="47703" end="47728" /> 
       <channel session="1010469" begin="47728" end="47778" /> 
       <channel session="235" begin="47778" end="47783" /> 
       <channel session="48" begin="47783" end="47828" /> 
       <channel session="1010367" begin="47828" end="47853" /> 
       <channel session="48" begin="47853" end="48373" /> 
       <channel session="1010383" begin="48373" end="48573" /> 
       <channel session="48" begin="48573" end="49433" /> 
       <channel session="1010452" begin="49433" end="49448" /> 
       <channel session="235" begin="49448" end="49453" /> 
       <channel session="1010001" begin="49453" end="49463" /> 
       <channel session="1010475" begin="49463" end="49468" /> 
       <channel session="1010383" begin="49468" end="49483" /> 
       <channel session="48" begin="49483" end="49508" /> 
       <channel session="1010403" begin="49508" end="49543" /> 
       <channel session="48" begin="49543" end="53839" /> 
       <channel session="1010354" begin="60868" end="60878" /> 
       <channel session="235" begin="60878" end="60893" /> 
       <channel session="48" begin="60893" end="60908" /> 
       <channel session="235" begin="60908" end="60913" /> 
       <channel session="1010072" begin="60913" end="60928" /> 
       <channel session="48" begin="60928" end="60933" /> 
       <channel session="235" begin="60933" end="60938" /> 
       <channel session="48" begin="60938" end="60943" /> 
       <channel session="235" begin="60943" end="60953" /> 
       <channel session="48" begin="60953" end="60968" /> 
       <channel session="235" begin="60968" end="60993" /> 
       <channel session="1010072" begin="60993" end="61483" /> 
       <channel session="1010499" begin="61483" end="61538" /> 
       <channel session="48" begin="61538" end="61543" /> 
       <channel session="47" begin="61543" end="61578" /> 
       <channel session="1010383" begin="61578" end="61603" /> 
       <channel session="1010072" begin="61603" end="63428" /> 
       <channel session="1010497" begin="63428" end="63438" /> 
       <channel session="1010319" begin="63438" end="63888" /> 
       <channel session="1010072" begin="63888" end="63898" /> 
       <channel session="235" begin="63898" end="63903" /> 
       <channel session="47" begin="63903" end="64368" /> 
       <channel session="48" begin="64368" end="64373" /> 
       <channel session="47" begin="64373" end="64378" /> 
       <channel session="1010072" begin="64378" end="64393" /> 
       <channel session="1010383" begin="64393" end="64408" /> 
       <channel session="1010001" begin="64408" end="64423" /> 
       <channel session="235" begin="64423" end="64428" /> 
       <channel session="1010469" begin="64428" end="64468" /> 
       <channel session="1010452" begin="64468" end="64473" /> 
       <channel session="48" begin="64473" end="64528" /> 
       <channel session="1010367" begin="64528" end="65053" /> 
       <channel session="48" begin="65053" end="65328" /> 
       <channel session="1010367" begin="65328" end="66613" /> 
       <channel session="1010383" begin="66613" end="66628" /> 
       <channel session="1010497" begin="66628" end="66823" /> 
       <channel session="47" begin="66823" end="66833" /> 
       <channel session="1010367" begin="66833" end="67538" /> 
       <channel session="1010383" begin="67538" end="67553" /> 
       <channel session="1010354" begin="67553" end="67563" /> 
       <channel session="1010497" begin="67563" end="67658" /> 
       <channel session="1010499" begin="67658" end="67713" /> 
       <channel session="1010072" begin="67713" end="67733" /> 
       <channel session="1010367" begin="67733" end="67803" /> 
       <channel session="48" begin="67803" end="68018" /> 
       <channel session="1010367" begin="68018" end="70963" /> 
       <channel session="1010383" begin="70963" end="71048" /> 
       <channel session="1010367" begin="71048" end="71848" /> 
       <channel session="1010383" begin="71848" end="72023" /> 
       <channel session="1010497" begin="72023" end="72043" /> 
       <channel session="1010072" begin="72043" end="72068" /> 
       <channel session="1010499" begin="72068" end="72083" /> 
       <channel session="47" begin="72083" end="72098" /> 
       <channel session="1010499" begin="72098" end="73643" /> 
       <channel session="1010072" begin="73643" end="73668" /> 
       <channel session="1010354" begin="73668" end="73683" /> 
       <channel session="1010383" begin="73683" end="73693" /> 
       <channel session="1010499" begin="73693" end="73708" /> 
       <channel session="1010072" begin="73708" end="73748" /> 
       <channel session="235" begin="73748" end="73753" /> 
       <channel session="1010497" begin="73753" end="73763" /> 
       <channel session="1010499" begin="73763" end="74323" /> 
       <channel session="1010497" begin="74323" end="74363" /> 
       <channel session="1010354" begin="74363" end="74378" /> 
       <channel session="1010499" begin="74378" end="74443" /> 
       <channel session="1010072" begin="74443" end="74453" /> 
       <channel session="1010499" begin="74453" end="74883" /> 
       <channel session="1010497" begin="74883" end="74938" /> 
       <channel session="1010072" begin="74938" end="74953" /> 
       <channel session="47" begin="74953" end="75603" /> 
       <channel session="1010497" begin="75603" end="77673" /> 
       <channel session="1010383" begin="77673" end="79208" /> 
       <channel session="1010354" begin="79208" end="79228" /> 
       <channel session="235" begin="79228" end="79233" /> 
       <channel session="1010497" begin="79233" end="79603" /> 
       <channel session="1010072" begin="79603" end="80088" /> 
       <channel session="1010497" begin="80088" end="80238" /> 
       <channel session="1010499" begin="80238" end="80253" /> 
       <channel session="235" begin="80253" end="80258" /> 
       <channel session="47" begin="80258" end="80423" /> 
       <channel session="1010072" begin="80423" end="80678" /> 
       <channel session="1010497" begin="80678" end="80688" /> 
       <channel session="47" begin="80688" end="80853" /> 
       <channel session="235" begin="80853" end="80858" /> 
       <channel session="1010354" begin="80858" end="80868" /> 
       <channel session="1010497" begin="80868" end="81428" /> 
       <channel session="1010072" begin="81428" end="81438" /> 
       <channel session="235" begin="81438" end="81448" /> 
       <channel session="47" begin="81448" end="81453" /> 
       <channel session="1010497" begin="81453" end="82108" /> 
       <channel session="1010354" begin="82108" end="82528" /> 
       <channel session="1010497" begin="82528" end="82538" /> 
       <channel session="1010072" begin="82538" end="82548" /> 
       <channel session="1010499" begin="82548" end="82608" /> 
       <channel session="47" begin="82608" end="82628" /> 
       <channel session="1010072" begin="82628" end="82648" /> 
       <channel session="1010354" begin="82648" end="82973" /> 
       <channel session="1010072" begin="82973" end="82983" /> 
       <channel session="1010499" begin="82983" end="83273" /> 
       <channel session="47" begin="83273" end="83313" /> 
       <channel session="1010383" begin="83313" end="83393" /> 
       <channel session="1010319" begin="83393" end="83443" /> 
       <channel session="1010499" begin="83443" end="83498" /> 
       <channel session="48" begin="83498" end="83688" /> 
       <channel session="47" begin="83688" end="83883" /> 
       <channel session="48" begin="83883" end="83898" /> 
       <channel session="1010258" begin="83898" end="84193" /> 
       <channel session="47" begin="84193" end="84198" /> 
       <channel session="1010260" begin="84198" end="84253" /> 
       <channel session="235" begin="84253" end="84258" /> 
       <channel session="47" begin="84258" end="84273" /> 
       <channel session="48" begin="84273" end="84423" /> 
       <channel session="235" begin="84423" end="84433" /> 
       <channel session="48" begin="84433" end="84448" /> 
       <channel session="1010260" begin="84448" end="84463" /> 
       <channel session="1010144" begin="84463" end="84578" /> 
       <channel session="235" begin="84578" end="84583" /> 
       <channel session="1010260" begin="84583" end="84628" /> 
       <channel session="235" begin="84628" end="84633" /> 
       <channel session="1010258" begin="84633" end="84643" /> 
       <channel session="48" begin="84643" end="85088" /> 
       <channel session="235" begin="85088" end="85098" /> 
       <channel session="1010258" begin="85098" end="85113" /> 
       <channel session="235" begin="85113" end="85118" /> 
       <channel session="1010144" begin="85118" end="85163" /> 
       <channel session="1010260" begin="85163" end="85178" /> 
       <channel session="48" begin="85178" end="85398" /> 
       <channel session="1010493" begin="85398" end="85418" /> 
       <channel session="48" begin="85418" end="85428" /> 
       <channel session="235" begin="85428" end="85433" /> 
       <channel session="47" begin="85433" end="85488" /> 
       <channel session="1010260" begin="85488" end="85493" /> 
       <channel session="1010258" begin="85493" end="85638" /> 
       <channel session="1010474" begin="85638" end="85785" /> 
      </individual_audience_tvset> 
      </individual_audience> 
      <alarms /> 
     </member_process> 
     </member> 
     <member id="3"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience> 
      <individual_audience_tvset id="1"> 
       <channel session="1010493" begin="54476" end="54490" /> 
       <channel session="47" begin="54490" end="55035" /> 
       <channel session="1010379" begin="55035" end="55210" /> 
       <channel session="48" begin="55210" end="55220" /> 
       <channel session="1010397" begin="55220" end="55570" /> 
       <channel session="235" begin="55570" end="55580" /> 
       <channel session="1010425" begin="55580" end="55585" /> 
       <channel session="1010397" begin="55585" end="55620" /> 
       <channel session="235" begin="55620" end="55625" /> 
       <channel session="48" begin="55625" end="56875" /> 
       <channel session="1010379" begin="56875" end="57345" /> 
       <channel session="48" begin="57345" end="57615" /> 
       <channel session="1010379" begin="57615" end="57865" /> 
       <channel session="48" begin="57865" end="58079" /> 
      </individual_audience_tvset> 
      </individual_audience> 
      <alarms /> 
     </member_process> 
     </member> 
     <member id="4"> 
     <member_process result="OK" vacation="undefined"> 
      <individual_audience /> 
      <alarms /> 
     </member_process> 
     </member> 
    </members> 
    <regular_guests /> 
    <occasional_guests /> 
    <tvsets> 
     <tvset id="1"> 
     <tvset_process result="OK"> 
      <tvset_audience> 
      <channel session="1010497" begin="46708" end="46738" /> 
      <channel session="48" begin="46738" end="46743" /> 
      <channel session="235" begin="46743" end="46748" /> 
      <channel session="47" begin="46748" end="46753" /> 
      <channel session="1010383" begin="46753" end="46973" /> 
      <channel session="1010497" begin="46973" end="46983" /> 
      <channel session="1010319" begin="46983" end="46993" /> 
      <channel session="1010072" begin="46993" end="47578" /> 
      <channel session="1010499" begin="47578" end="47618" /> 
      <channel session="47" begin="47618" end="47633" /> 
      <channel session="1010383" begin="47633" end="47693" /> 
      <channel session="48" begin="47693" end="47703" /> 
      <channel session="235" begin="47703" end="47728" /> 
      <channel session="1010469" begin="47728" end="47778" /> 
      <channel session="235" begin="47778" end="47783" /> 
      <channel session="48" begin="47783" end="47828" /> 
      <channel session="1010367" begin="47828" end="47853" /> 
      <channel session="48" begin="47853" end="48373" /> 
      <channel session="1010383" begin="48373" end="48573" /> 
      <channel session="48" begin="48573" end="49433" /> 
      <channel session="1010452" begin="49433" end="49448" /> 
      <channel session="235" begin="49448" end="49453" /> 
      <channel session="1010001" begin="49453" end="49463" /> 
      <channel session="1010475" begin="49463" end="49468" /> 
      <channel session="1010383" begin="49468" end="49483" /> 
      <channel session="48" begin="49483" end="49508" /> 
      <channel session="1010403" begin="49508" end="49543" /> 
      <channel session="48" begin="49543" end="53839" /> 
      <channel session="1010493" begin="54460" end="54490" /> 
      <channel session="47" begin="54490" end="55035" /> 
      <channel session="1010379" begin="55035" end="55210" /> 
      <channel session="48" begin="55210" end="55220" /> 
      <channel session="1010397" begin="55220" end="55570" /> 
      <channel session="235" begin="55570" end="55580" /> 
      <channel session="1010425" begin="55580" end="55585" /> 
      <channel session="1010397" begin="55585" end="55620" /> 
      <channel session="235" begin="55620" end="55625" /> 
      <channel session="48" begin="55625" end="56875" /> 
      <channel session="1010379" begin="56875" end="57345" /> 
      <channel session="48" begin="57345" end="57615" /> 
      <channel session="1010379" begin="57615" end="57865" /> 
      <channel session="48" begin="57865" end="58079" /> 
      <channel session="1010354" begin="60848" end="60878" /> 
      <channel session="235" begin="60878" end="60893" /> 
      <channel session="48" begin="60893" end="60908" /> 
      <channel session="235" begin="60908" end="60913" /> 
      <channel session="1010072" begin="60913" end="60928" /> 
      <channel session="48" begin="60928" end="60933" /> 
      <channel session="235" begin="60933" end="60938" /> 
      <channel session="48" begin="60938" end="60943" /> 
      <channel session="235" begin="60943" end="60953" /> 
      <channel session="48" begin="60953" end="60968" /> 
      <channel session="235" begin="60968" end="60993" /> 
      <channel session="1010072" begin="60993" end="61483" /> 
      <channel session="1010499" begin="61483" end="61538" /> 
      <channel session="48" begin="61538" end="61543" /> 
      <channel session="47" begin="61543" end="61578" /> 
      <channel session="1010383" begin="61578" end="61603" /> 
      <channel session="1010072" begin="61603" end="63428" /> 
      <channel session="1010497" begin="63428" end="63438" /> 
      <channel session="1010319" begin="63438" end="63888" /> 
      <channel session="1010072" begin="63888" end="63898" /> 
      <channel session="235" begin="63898" end="63903" /> 
      <channel session="47" begin="63903" end="64368" /> 
      <channel session="48" begin="64368" end="64373" /> 
      <channel session="47" begin="64373" end="64378" /> 
      <channel session="1010072" begin="64378" end="64393" /> 
      <channel session="1010383" begin="64393" end="64408" /> 
      <channel session="1010001" begin="64408" end="64423" /> 
      <channel session="235" begin="64423" end="64428" /> 
      <channel session="1010469" begin="64428" end="64468" /> 
      <channel session="1010452" begin="64468" end="64473" /> 
      <channel session="48" begin="64473" end="64528" /> 
      <channel session="1010367" begin="64528" end="65053" /> 
      <channel session="48" begin="65053" end="65328" /> 
      <channel session="1010367" begin="65328" end="66613" /> 
      <channel session="1010383" begin="66613" end="66628" /> 
      <channel session="1010497" begin="66628" end="66823" /> 
      <channel session="47" begin="66823" end="66833" /> 
      <channel session="1010367" begin="66833" end="67538" /> 
      <channel session="1010383" begin="67538" end="67553" /> 
      <channel session="1010354" begin="67553" end="67563" /> 
      <channel session="1010497" begin="67563" end="67658" /> 
      <channel session="1010499" begin="67658" end="67713" /> 
      <channel session="1010072" begin="67713" end="67733" /> 
      <channel session="1010367" begin="67733" end="67803" /> 
      <channel session="48" begin="67803" end="68018" /> 
      <channel session="1010367" begin="68018" end="70963" /> 
      <channel session="1010383" begin="70963" end="71048" /> 
      <channel session="1010367" begin="71048" end="71848" /> 
      <channel session="1010383" begin="71848" end="72023" /> 
      <channel session="1010497" begin="72023" end="72043" /> 
      <channel session="1010072" begin="72043" end="72068" /> 
      <channel session="1010499" begin="72068" end="72083" /> 
      <channel session="47" begin="72083" end="72098" /> 
      <channel session="1010499" begin="72098" end="73643" /> 
      <channel session="1010072" begin="73643" end="73668" /> 
      <channel session="1010354" begin="73668" end="73683" /> 
      <channel session="1010383" begin="73683" end="73693" /> 
      <channel session="1010499" begin="73693" end="73708" /> 
      <channel session="1010072" begin="73708" end="73748" /> 
      <channel session="235" begin="73748" end="73753" /> 
      <channel session="1010497" begin="73753" end="73763" /> 
      <channel session="1010499" begin="73763" end="74323" /> 
      <channel session="1010497" begin="74323" end="74363" /> 
      <channel session="1010354" begin="74363" end="74378" /> 
      <channel session="1010499" begin="74378" end="74443" /> 
      <channel session="1010072" begin="74443" end="74453" /> 
      <channel session="1010499" begin="74453" end="74883" /> 
      <channel session="1010497" begin="74883" end="74938" /> 
      <channel session="1010072" begin="74938" end="74953" /> 
      <channel session="47" begin="74953" end="75603" /> 
      <channel session="1010497" begin="75603" end="77673" /> 
      <channel session="1010383" begin="77673" end="79208" /> 
      <channel session="1010354" begin="79208" end="79228" /> 
      <channel session="235" begin="79228" end="79233" /> 
      <channel session="1010497" begin="79233" end="79603" /> 
      <channel session="1010072" begin="79603" end="80088" /> 
      <channel session="1010497" begin="80088" end="80238" /> 
      <channel session="1010499" begin="80238" end="80253" /> 
      <channel session="235" begin="80253" end="80258" /> 
+1

這是非常相似的http://stackoverflow.com/questions/40787171/transform-data-from-xml-into -r-dataframe/40790195#40790195。使用標記爲(2)的解決方案,而不是在'// test'循環,'// channel'循環獲得'session','begin'和'end'屬性。通過其祖先獲取其「成員」節點始祖的「id」屬性,其「家庭」節點祖先的「id」屬性和其「家庭」節點祖先的「date」屬性。或者多做一點工作,您可以按照標記爲更快的解決方案(1)。 –

+0

謝謝格洛騰迪克! –

回答

2

這完全是勉強的,並且熟悉XML的結構。相對較新的purrr包在這裏非常有用。但是,我認爲你的問題還是有點模糊,缺乏有效的XML文件的超過有問題的:

library(xml2) 
library(purrr) 
library(dplyr) 

hh <- read_xml("~/data/so.xml") 

xml_find_all(hh, ".//household") %>% 
    map_df(function(x) { 
    id <- xml_attr(x, "id") 
    members <- xml_find_all(x, ".//member") 
    map_df(members, function(y) { 
     channel_sessions <- xml_find_all(y, ".//channel") 
     if (length(channel_sessions) == 0) { 
     NULL 
     } else { 
     xml_attrs(channel_sessions) %>% 
      map_df(as.list) %>% 
      mutate(memberid=xml_attr(y, "id"), 
       household=id) 
     } 
    }) 
    }) %>% 
    mutate(date=xml_find_first(hh, "//households") %>% 
      xml_attr("creation_date") %>% 
      as.Date()) 
## # A tibble: 176 × 6 
## session begin end memberid household  date 
##  <chr> <chr> <chr> <chr>  <chr>  <date> 
## 1 1010497 46722 46738  2 4FDYT14A1Q 2015-01-11 
## 2  48 46738 46743  2 4FDYT14A1Q 2015-01-11 
## 3  235 46743 46748  2 4FDYT14A1Q 2015-01-11 
## 4  47 46748 46753  2 4FDYT14A1Q 2015-01-11 
## 5 1010383 46753 46973  2 4FDYT14A1Q 2015-01-11 
## 6 1010497 46973 46983  2 4FDYT14A1Q 2015-01-11 
## 7 1010319 46983 46993  2 4FDYT14A1Q 2015-01-11 
## 8 1010072 46993 47578  2 4FDYT14A1Q 2015-01-11 
## 9 1010499 47578 47618  2 4FDYT14A1Q 2015-01-11 
## 10  47 47618 47633  2 4FDYT14A1Q 2015-01-11 
## # ... with 166 more rows 
0

隨着字變換XML在相同的標題,考慮XSLT,特殊旨在將源XML文件轉換爲最終用途需求的專用語言,例如導入R數據框。現在面臨的挑戰是R沒有專門的庫來運行XSLT。

儘管如此,R可以將命令行調用到external processers(即使在Python,PHP,Java等其他語言中)以將輸入轉換爲輸出。下面是兩個版本:適用於Windows(使用PowerShell對XslCompiledTransform調用)和Linux/Mac上(使用xsltproc通常安裝在大多數分佈):

XSLT腳本(另存爲的.xsl,一個良好的XML文件)

<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> 
<xsl:output version="1.0" encoding="UTF-8" indent="yes" /> 
<xsl:strip-space elements="*"/> 

    <xsl:template match="/households"> 
    <xsl:copy> 
    <xsl:apply-templates select="household"/> 
    </xsl:copy> 
    </xsl:template> 

    <xsl:template match="household|members|member|member_process|individual_audience|individual_audience_tvset"> 
    <xsl:apply-templates/> 
    </xsl:template> 

    <xsl:template match="channel"> 
    <tvset> 
     <date><xsl:value-of select="ancestor::households/@date"/></date> 
     <household_id><xsl:value-of select="ancestor::household/@id"/></household_id> 
     <member_id><xsl:value-of select="ancestor::member/@id"/></member_id> 
     <session><xsl:value-of select="@session"/></session> 
     <begin><xsl:value-of select="@begin"/></begin> 
     <end><xsl:value-of select="@end"/></end> 
    </tvset>  
    </xsl:template> 

    <xsl:template match="tvsets"/> 
</xsl:transform> 

的PowerShell腳本(保存爲名爲.ps1文件,僅Windows)

param ($xml, $xsl, $output) 

if (-not $xml -or -not $xsl -or -not $output) { 
    Write-Host "& .\xslt.ps1 [-xml] xml-input [-xsl] xsl-input [-output] transform-output" 
    exit; 
} 

trap [Exception]{ 
    Write-Host $_.Exception; 
} 

$xslt = New-Object System.Xml.Xsl.XslCompiledTransform; 
$xslt.Load($xsl); 
$xslt.Transform($xml, $output); 

Write-Host "generated" $output; 

[R(Windows)中

library(XML) 

system(paste0('Powershell.exe -File', 
       ' "C:\\Path\\To\\PowerShell\\Script.ps1"', 
       ' "C:\\Path\\To\\Input.xml"', 
       ' "C:\\Path\\To\\XSLT\\Script.xsl"', 
       ' "C:\\Path\\To\\Output.xml"')) 

df <- xmlToDataFrame("C:\\Path\\To\\Output.xml") 

head(df, 10) 
#   date household_id member_id session begin end 
# 1 2015-01-05 4FDYT14A1Q   2 1010497 46722 46738 
# 2 2015-01-05 4FDYT14A1Q   2  48 46738 46743 
# 3 2015-01-05 4FDYT14A1Q   2  235 46743 46748 
# 4 2015-01-05 4FDYT14A1Q   2  47 46748 46753 
# 5 2015-01-05 4FDYT14A1Q   2 1010383 46753 46973 
# 6 2015-01-05 4FDYT14A1Q   2 1010497 46973 46983 
# 7 2015-01-05 4FDYT14A1Q   2 1010319 46983 46993 
# 8 2015-01-05 4FDYT14A1Q   2 1010072 46993 47578 
# 9 2015-01-05 4FDYT14A1Q   2 1010499 47578 47618 
# 10 2015-01-05 4FDYT14A1Q   2  47 47618 47633 

[R(的Linux/Mac上)

library(XML) 

system('xsltproc /path/to/XSLT_Script.xsl /path/to/Input.xml -o path/to/Output.xml') 

df <- xmlToDataFrame("path/to/Output.xml")