2014-09-19 64 views
1

我有一個準備遷移到 Access 的 Excel 電子表格,其中日期列的條目有多種格式,例如:1963 to 1969、Aug. 1968 to Sept. 1968、1972、Mar-73、24-Jul、Oct. 2 1980、Aug 29, 1980、July 1946 等,以及「undated」(未註明日期)。我把將作爲鍵的列(地圖編號)和日期列導出到 csv,處理後再寫回 csv。我能提取出 4 位數的年份,但處理不了範圍。除了手工重新格式化之外,我不知道該如何提取日(天)和兩位數的年份。我的代碼不太優雅,可能也不是最佳做法:日期到類別

import csv, xlwt, re 

# create new Excel document and add sheet 
# from tempfile import TemporaryFile 
from xlwt import Workbook 
book = Workbook()
sheet1 = book.add_sheet('Sheet 1')

# populate first row with header
sheet1.write(0, 0, "Year")
sheet1.write(0, 1, "Map")
sheet1.write(0, 2, "As Entered")

# index of the last sheet row written; row 0 holds the header
rowCount = 0

# Python 2's csv module expects the file to be opened in binary mode.
# Raw strings keep the backslashes of the Windows paths literal.
with open(r'C:\dateTestMSDOs.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        mapNumber = row[0]  # first column is the map number
        dateRaw = row[1]  # second column is the raw date as entered

        # Classify every entry exactly once.  The branches are mutually
        # exclusive so that 'undated' and blank entries are not also sent
        # through the four-digit search and written a second time.
        if dateRaw == 'undated':
            yearStr = '0000'
        elif dateRaw == '':
            yearStr = 'NoEntry'
        else:
            # first run of four consecutive digits, if any
            year = re.search(r'\d\d\d\d', dateRaw)
            # no four-digit year: flag the entry for manual cleaning
            yearStr = year.group() if year is not None else 'Format'

        rowCount += 1
        sheet1.write(rowCount, 0, yearStr)
        sheet1.write(rowCount, 1, mapNumber)
        sheet1.write(rowCount, 2, dateRaw)

book.save(r'D:\dateProperty.xls')
print("Done!")

我還想把日和月份寫入額外的列,並提取範圍條目中的第二個 4 位數年份。

+0

你看過用於處理和格式化日期的 [datetime](https://docs.python.org/2/library/datetime.html) 模組嗎? – N1B4 2014-09-19 22:50:00

+0

[Pandas](http://pandas.pydata.org)具有強大的日期分析功能。我會看看這個。 – b10n 2014-09-19 23:17:43

回答

1

您可以嘗試使用dateutil。我認爲你仍然需要以不同的方式處理一些更難的格式。請參閱下面的實現:

代碼:

import dateutil.parser as dateparser 

# Sample entries covering the formats found in the spreadsheet.
date_list = [
    '1963 to 1969',
    'Aug. 1968 to Sept. 1968',
    'Mar-73',
    '24-Jul',
    'Oct. 2 1980',
    'Aug 29, 1980',
    'July 1946',
    'undated',
]

for d in date_list:
    # Match the separator with its surrounding spaces: a bare 'to' test
    # would also fire on words that merely contain those letters (such as
    # "October") and then split the entry in the wrong place.
    if ' to ' in d:
        a, b = d.split(' to ', 1)
        # Get the higher number. Use min to get lower of two.
        print(max(dateparser.parse(a.strip()).year,
                  dateparser.parse(b.strip()).year))
    elif d == 'undated':
        print('0000')
    else:
        print(dateparser.parse(d).year)

結果:

1969 
1968 
1973 
2014 
1980 
1980 
1946 
0000 
[Finished in 0.4s] 

我能看到的唯一突出問題是 24-Jul:解析器會用當前日期的日、月或年來填補缺失的部分,所以它返回了 2014 年。同理,如果今天是本月 20 日,Mar-73 會被解析爲 1973-03-20,依此類推。

+0

絕對是我見過的最好的處理方式,比我的代碼更好。我希望當我們將焦點改變爲不使日期和月份成爲可搜索參數時,我會記得您的帖子。 – 2014-11-20 15:05:55

+0

不可否認,我會用'pandas'來代替,但是如果沒有訪問你的數據,這是我能想到的最好的。 – Manhattan 2014-11-20 15:08:35

0

我不完全確定這是否是你想要的,不過我的做法是:先用一個「簡單」的正則表達式進行搜索,然後遍歷各組捕獲組,對匹配上的那一組調用預先定義好的函數。如果找到匹配,那麼被調用的函數(定義在 regex_groups 變量中)應返回帶有以下鍵的字典:start_day, start_month, start_year, end_day, end_month, end_year

然後,您可以對這些值進行任何操作。我知道這絕對不是最乾淨的解決方案,但它能用。

#!/usr/local/bin/python2.7 

import re 

# One alternative per supported layout.  The capture groups are numbered
# left to right across the whole pattern; regex_groups below is keyed on
# those numbers.
regex_pattern = (
    r'(?:(\d{4}) to (\d{4}))'                     # groups 1-2
    r'|(?:(\w+)\. (\d{4}) to (\w+)\. (\d{4}))'    # groups 3-6
    r'|(?:(\w+)-(\d{2}))'                         # groups 7-8
    r'|(?:(\d{2})-(\w+))'                         # groups 9-10
    r'|(?:(\w+)\. (\d+), (\d{4}))'                # groups 11-13
    r'|(?:(\w+) (\d+), (\d{4}))'                  # groups 14-16
    r'|(?:(\w+) (\d{4}))'                         # groups 17-18
    r'|(?:(\d{4}))'                               # group 19
)

date_strings = [
    '1963 to 1969',
    'Aug. 1968 to Sept. 1968',
    '1972',
    'Mar-73',
    '24-Jul',
    'Oct. 2, 1980',
    'Aug 29, 1980',
    'July 1946',
]


def _date_parts(start_day='', start_month='', start_year='',
                end_day='', end_month='', end_year=''):
    """Build the result dictionary; components not supplied stay ''."""
    return {
        'start_day': start_day,
        'start_month': start_month,
        'start_year': start_year,
        'end_day': end_day,
        'end_month': end_month,
        'end_year': end_year,
    }


# Maps a tuple of capture-group numbers to the function that turns the
# captured strings (in the same order as the tuple) into a result dictionary.
regex_groups = {
    (1, 2): lambda g: _date_parts(start_year=g[0], end_year=g[1]),
    (3, 4, 5, 6): lambda g: _date_parts(start_month=g[0], start_year=g[1],
                                        end_month=g[2], end_year=g[3]),
    (7, 8): lambda g: _date_parts(start_month=g[0], start_year=g[1]),
    # "24-Jul": the digits come first and are the day, the word is the month.
    (9, 10): lambda g: _date_parts(start_day=g[0], start_month=g[1]),
    (11, 12, 13): lambda g: _date_parts(start_day=g[1], start_month=g[0],
                                        start_year=g[2]),
    (14, 15, 16): lambda g: _date_parts(start_day=g[1], start_month=g[0],
                                        start_year=g[2]),
    (17, 18): lambda g: _date_parts(start_month=g[0], start_year=g[1]),
    (19,): lambda g: _date_parts(start_year=g[0]),
}


def parse_date_string(ds):
    """Split a free-form date entry into its start/end components.

    Returns a dictionary with the keys start_day, start_month, start_year,
    end_day, end_month and end_year.  A component the entry does not
    contain is set to a blank string.  Returns None when the entry matches
    none of the layouts in regex_pattern.
    """
    matches = re.search(regex_pattern, ds)
    if matches is None:
        return None

    for regex_group, group_func in regex_groups.items():
        group_matches = [matches.group(sub_group_num)
                         for sub_group_num in regex_group]
        # Only the groups of the alternative that actually matched are set.
        if all(group_matches):
            return group_func(group_matches)
    return None


for ds in date_strings:
    match_data = parse_date_string(ds)
    if match_data is None:
        print('Unmatched: %s' % ds)
        continue

    start = '-'.join([match_data['start_day'],
                      match_data['start_month'],
                      match_data['start_year']])
    end = '-'.join([match_data['end_day'],
                    match_data['end_month'],
                    match_data['end_year']])
    print('')
    print('Matched: %s' % ds)
    print('%s to %s' % (start, end))

輸出:

Matched: 1963 to 1969 
--1963 to --1969 

Matched: Aug. 1968 to Sept. 1968 
-Aug-1968 to -Sept-1968 

Matched: 1972 
--1972 to -- 

Matched: Mar-73 
-Mar-73 to -- 

Matched: 24-Jul 
Jul--24 to -- 

Matched: Oct. 2, 1980 
2-Oct-1980 to -- 

Matched: Aug 29, 1980 
29-Aug-1980 to -- 

Matched: July 1946 
-July-1946 to -- 
+0

謝謝布萊斯。在時間不足之前,這是我想去的地方,工作參數改變了焦點。不過,我會保留這些代碼。我有一堆令人討厭的數據要在幾個月內清理乾淨,而不是更有效的方法,這將完全適應。 – 2014-11-20 14:47:58

0

你可以用正則表達式定義日期所有可能的情況,像這樣:

import re 
# Sample date entries, one per line, in the formats seen in the spreadsheet.
s = [
    '1963 to 1969',
    'Aug. 1968 to Sept. 1968',
    '1972',
    'Mar-73',
    '03-Jun',
    '24-Jul',
    'Oct. 2, 1980',
    'Oct. 26, 1980',
    'Aug 29 1980',
    'July 1946',
]


def get_year(date):
    """Return the year token(s) found in *date* as a list of strings.

    Every run of four digits is returned (so a range yields two entries).
    When there is none, a two-digit number following ``word-`` (as in
    ``Mar-73``) is returned as a one-element list.  Returns None when
    neither form is present.
    """
    four_digit = re.findall(r"\d{4}", date)
    if four_digit:
        return four_digit

    two_digit = re.search(r"\w+-(\d{2})", date)
    if two_digit:
        return [two_digit.group(1)]

    return None

def get_month(date):
    """Return every capitalised word in *date* as a list, or None if none.

    A word here is an uppercase letter followed by one or more lowercase
    letters; the pattern does not check that it is really a month name.
    """
    words = re.findall("[A-Z][a-z]+", date)
    return words or None

def get_day(date):
    """Return the day of the month in *date* as a one-element list.

    Two layouts are recognised: one or two digits directly before
    ``-Word`` (``24-Jul``), and one or two digits after a capitalised word
    and followed by a comma (``Oct. 2, 1980``).  Returns None when neither
    layout is found.
    """
    d_expr = [
        r"(\d|\d{2})\-[A-Z][a-z]+",       # e.g. 24-Jul
        r"[A-Z][a-z]+[\. ]+(\d|\d{2}),",  # e.g. Oct. 2, 1980
    ]
    for expr in d_expr:
        mm = re.search(expr, date)
        if mm:
            return [mm.group(1)]
    return None

d = {} 
m = {} 
y = {} 

for idx, date in enumerate(s): 
    d[idx] = get_day(date) 
    m[idx] = get_month(date) 
    y[idx] = get_year(date) 

print "Year Dict: ", y 
print "Month Dict: ", m 
print "Day Dict: ", d 

這樣就得到了日、月、年三個字典,可以用它們來填充各行。

輸出:

Year Dict: {0: ['1963', '1969'], 1: ['1968', '1968'], 2: ['1972'], 3: ['73'], 4: None, 5: None, 6: ['1980'], 7: ['1980'], 8: ['1980'], 9: ['1946']} 
Month Dict: {0: None, 1: ['Aug', 'Sept'], 2: None, 3: ['Mar'], 4: ['Jun'], 5: ['Jul'], 6: ['Oct'], 7: ['Oct'], 8: ['Aug'], 9: ['July']} 
Day Dict: {0: None, 1: None, 2: None, 3: None, 4: ['03'], 5: ['24'], 6: ['2'], 7: ['26'], 8: None, 9: None} 
+0

嘿,謝謝你的建議!我想我還是把我那段笨拙但能用的代碼貼出來。輸出的 .xls 作爲 LU 表導入到我們的數據庫中,工作正常。缺點是任何空格或多餘的字符都會破壞它,但優點是它迫使原始數據輸入符合數據庫中單一可搜索年份類別的輸出(這是我們必須解決的問題,因爲我們涉及範圍、日期、月份等)。 – 2014-11-20 04:45:09

0

謝謝你的創新建議。經過考慮,我們決定從數據庫中的可搜索內容中刪除日期和月份,因爲只有相對少量的數據具有該詳細程度。這裏是我用來從一個漫長而雜亂的列表中提取和生成我需要的數據的代碼。

import csv, xlwt, re 
# create new Excel document and add sheet 
from xlwt import Workbook 
book = Workbook()
sheet1 = book.add_sheet('Sheet 1')

# populate first row with header
sheet1.write(0, 0, "MapYear_(Parsed)")
sheet1.write(0, 1, "Map_Number")
sheet1.write(0, 2, "As_Entered")

# index of the last sheet row written; row 0 holds the header
rowCount = 0

# Python 2's csv module expects the file to be opened in binary mode.
# Raw strings keep the backslashes of the Windows paths literal.
with open(r'C:\mapsDateFix.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        mapNumber = row[0]  # first column is the map number
        dateRaw = row[1]  # second column is the raw date as entered

        # Decide the parsed value from this row alone.  Carrying yearStr
        # over from the previous row made blank entries get written twice
        # (or only as 'Format' right after an 'undated' row).
        if dateRaw == 'undated':
            yearStr = 'undated'
        elif dateRaw == '':
            yearStr = 'NoEntry'
        else:
            # first run of four consecutive digits, if any
            year = re.search(r'\d\d\d\d', dateRaw)
            # no four-digit year: flag the entry for manual cleaning
            yearStr = year.group() if year is not None else 'Format'

        rowCount += 1
        sheet1.write(rowCount, 0, yearStr)
        sheet1.write(rowCount, 1, mapNumber)
        sheet1.write(rowCount, 2, dateRaw)

book.save(r'D:\dateProperty.xls')
print("Done!")