2016-02-14 36 views
1

我正在寫一個用來解析 csv 文件的腳本。Python:若行以 <!-- 開頭則跳到下一行 - 正則表達式不匹配

如果文件以 <!-- 開頭,我希望能夠跳到下一行文本並繼續解析。

問題是我的正則表達式,我無法匹配。

if re.match(r'^.\<\!', line): 
    line.next() 

示例文本

<!-- Copyright Notice: © 2010 Racing NSW (and other parties working with it). NSW racing information,including fields, form and results, is subject to copyright which is owned by Racing NSW and other parties working with it. --> 

Meeting,17/02/16,CANT,Canterbury Park,Weights,TAB,+6m Entire Circuit,   ,   
Race,1,BENCHMARK 77 HANDICAP,BM77,BM77,1550,BM77  ,3U  ,~   ,HCP  ,54,0,0,17/02/2016,,   ,   ,   ,   ,BenchMark 77, Handicap, For Three-Years-Old and Upwards, No sex restriction,Of $40000. First $23025, second $7925, third $3960, fourth $1885, fifth $955, sixth $450, seventh $450, eighth $450, ninth $450, tenth $450 
Horse,1,Balboa Park (NZ),0,"Gai Waterhouse",Randwick,,0,54.5,3-1-1-0 $30000.00,,0,0,0,,65.00,G, 
Horse,2,Baylie Louise,0,"Matthew Dale",Canberra,,0,55,16-6-2-4 $112545.00,,0,0,0,,69.00,M, 
Horse,3,Beretta,0,"Kris Lees",Broadmeadow,,0,55.5,8-2-1-1 $38305.00,,0,0,0,,66.00,G, 
Horse,4,Elle Lou,0,"Chris Waller",Rosehill,,0,57.5,14-2-4-0 $141625.00,,0,0,0,,74.00,M, 
Horse,5,Got Unders,0,"Ken Lantry",Broadmeadow,,0,60,33-4-9-9 $140735.00,,0,0,0,,75.00,G, 
Horse,6,Lord de Air,0,"Bede Murray",Kembla Grange,,0,57,16-4-2-3 $89050.00,,0,0,0,,69.00,G, 
Horse,7,Lucky Liaison,0,"Kristen Buchanan",Wyong,,0,61,49-8-6-8 $257865.00,,0,0,0,,77.00,G, 
Horse,8,Makeadane,0,"John P Thompson",Randwick,,0,55,15-2-2-2 $65002.00,,0,0,0,,65.00,G, 
Horse,9,Miss Denni (NZ),0,"Chris Waller",Rosehill,,0,57.5,12-2-5-1 $102075.00,,0,0,0,,74.00,M, 
Horse,10,Multifacets (NZ),0,"Chris Waller",Rosehill,,0,54,6-1-0-0 $19845.00,,0,0,0,,62.00,C, 
Horse,11,Mydream,0,"Melissa Harrison",Kembla Grange,,0,56.5,34-8-2-3 $142520.00,,0,0,0,,72.00,M, 
Horse,12,Never Back Down,0,"Jim & Greg Lee",Randwick,,0,58,33-4-3-8 $151090.00,,0,0,0,,71.00,G, 
Horse,13,Orcym Sam,0,"Gwenda Markwell",Kembla Grange,,0,59,6-3-2-0 $44350.00,,0,0,0,,73.00,G, 
Horse,14,Recife Beach,0,"Kim Waugh",Wyong,,0,57,21-3-5-2 $77175.00,,0,0,0,,69.00,G, 
Horse,15,Soros,0,"Joseph Pride",Warwick Farm,,0,60,36-6-2-4 $249975.00,,0,0,0,,75.00,G, 
Horse,16,Spiritos,0,"Chris Waller",Rosehill,,0,55.5,8-2-0-1 $45585.00,,0,0,0,,67.00,G, 
Horse,17,Ultima Chance,0,"Scott Collings",Goulburn,,0,55,39-9-6-3 $104437.00,,0,0,0,,65.00,G, 
Race,2,BENCHMARK 72 HANDICAP,BM72,BM72,1250,BM72  ,3U  ,~   ,HCP  ,55.5,0,0,17/02/2016,, 

這是完整的文件

import csv 
import re 
from sys import argv 
# Unpack the command line into the script name and the input file path;
# the two-name unpacking fails unless exactly one argument is supplied.
SCRIPT, FILENAME = argv 


def out_file_name(file_name):
    """Return *file_name* with ``_clean`` inserted before its extension.

    Only the final extension is treated as the suffix, so dots elsewhere
    in the path or name are left alone:

    >>> out_file_name('races.csv')
    'races_clean.csv'
    >>> out_file_name('races.2016.csv')
    'races.2016_clean.csv'

    A name without an extension simply gets ``_clean`` appended.
    """
    # Local import: the surrounding script does not import ``os``.
    import os.path

    # splitext splits on the last dot of the final path component only,
    # unlike a plain str.split('.') which breaks on every dot.
    root, ext = os.path.splitext(file_name)
    return root + '_clean' + ext


def race_table(text_file):
    """Flatten Meeting/Race/Horse records into one tuple per horse.

    ``text_file`` is an iterable of already-split rows (lists of strings),
    such as a ``csv.reader``.  Rows are dispatched on their first field:

    * ``Meeting`` and ``Race`` rows update the context (venue, rail, date,
      distance, ...) that applies to the rows following them.
    * ``Horse`` rows emit one output tuple combining that context with the
      horse's own fields.
    * Empty rows and rows whose first field starts with ``<!`` (the
      HTML-style comment banner at the top of the file) are skipped.
    * Any other row is ignored.

    Returns a list of 22-tuples ordered as: meeting, date, rail, weather,
    track, distance, benchmark, race, number, name, sex, b_rating, weight,
    barrier, starts, wins, seconds, thirds, prizemoney, trainer, location,
    jockey.
    """
    output_table = []

    # Context carried forward from the most recent Meeting / Race row.
    # Initialised so a Horse row seen before either still yields a tuple.
    meeting = rail = weather = track = ''
    date = race = benchmark = distance = ''

    for record in text_file:
        # csv.reader yields [] for blank lines, and the comment banner is
        # split on its commas, so only its first field carries the '<!'.
        if not record or record[0].startswith('<!'):
            continue

        kind = record[0]
        if kind == 'Meeting':
            meeting = record[3]
            rail = record[6]
            weather = record[7]
            track = record[8]
        elif kind == 'Race':
            date = record[13]
            race = record[1]
            benchmark = record[4]
            distance = record[5]
        elif kind == 'Horse':
            number = record[1]
            name = record[2]
            trainer = record[4]
            location = record[5]
            jockey = record[6]
            barrier = record[7]
            weight = record[8]
            b_rating = record[15]
            sex = record[16]

            # The results field looks like "3-1-1-0 $30000.00":
            # starts-wins-seconds-thirds, then an optional prizemoney part.
            res_split = re.split('[- ]', record[9])
            starts = res_split[0]
            wins = res_split[1]
            seconds = res_split[2]
            thirds = res_split[3]
            prizemoney = res_split[4] if len(res_split) > 4 else 0

            print(name, wins, seconds)
            output_table.append((
                meeting, date, rail, weather, track, distance,
                benchmark, race, number, name, sex, b_rating,
                weight, barrier, starts, wins, seconds,
                thirds, prizemoney, trainer, location, jockey,
            ))

    # Return only after every row has been consumed.
    return output_table

# Destination path derived from the input file name.
MY_FILE = out_file_name(FILENAME)

# Column titles, in the same order as the tuples built by race_table().
headers = [
    'MEETING', 'DATE', 'RAIL', 'WEATHER', 'TRACK', 'DISTANCE',
    'BENCHMARK', 'RACE', 'NUMBER', 'NAME', 'SEX', 'B_RATING',
    'WEIGHT', 'BARRIER', 'STARTS', 'WINS', 'SECONDS', 'THIRDS',
    'PRIZEMONEY', 'TRAINER', 'LOCATION', 'JOCKEY',
]

with open(FILENAME, 'r') as f_in, open(MY_FILE, 'w') as f_out:
    # Parse the raw input and flatten it into one row per horse.
    CONTENT = csv.reader(f_in)
    FILE_CONTENTS = race_table(CONTENT)

    # Emit the header line followed by every flattened row.
    f_csv = csv.writer(f_out)
    f_csv.writerow(headers)
    f_csv.writerows(FILE_CONTENTS)


# No-op guard: all of the work above already runs at module level.
if __name__ == '__main__':
    pass

回答

1

刪除表達式開頭的點:

>>> s = "<!-- Copyright Notice: © 2010 Racing NSW (and other parties working with it). NSW racing information,including fields, form and results, is subject to copyright which is owned by Racing NSW and other parties working with it. -->" 
>>> 
>>> re.match(r'^.\<\!', s) 
>>> re.match(r'^\<\!', s) 
<_sre.SRE_Match object at 0x10da7fed0> 

或者,你可以在初始化 csv.reader 時把註釋標頭行過濾掉:

with open(FILENAME, 'r') as f_in, open(MY_FILE, 'w') as f_out: 
    CONTENT = csv.reader(row for row in f_in if not row.startswith('<!--')) 
+0

喜歡這個「not」的寫法,非常乾淨 – sayth

1
if re.match(r'^<!.*', line): 

刪除前面的 . 並在末尾添加 .*。另外,沒有必要轉義 <!