我試圖創建一個使用規範化的tf-idf作爲功能的K-means模型。我得到'ZeroDivisionError:浮動除以零'。以下是代碼。請讓我知道是否需要更多細節。Python 2.7 - GraphLab:ZeroDivisionError:float除零
import graphlab as gl
data = gl.SFrame(data)
data['tf_idf'] = gl.text_analytics.tf_idf(data['text'])
def normalize(d, target=1.0):
raw = sum(d.values())
try:
factor = target/raw
except ZeroDivisionError:
print 'Zero Error in file ', d['file_name']
return {key:value*factor for key,value in d.iteritems()}
data['tf_idf_norm'] = data['tf_idf'].apply(normalize)
print data[0]
{'file_name': 'ap-4081.pdf.txt',
'tf_idf_norm': {'september': 0.006612160101629999, 'issued': 0.004914445160361691, 'declaration': 0.018380116959675345, 'pursuant': 0.0015236875459684344, 'held': 0.006661456734504585, 'edt': 0.01993691915396277, 'its': 0.0018031759056466382, 'before': 0.000628458524686868, 'prehearing': 0.03603879166997583, 'mail': 0.010151784816687623, 'administrative': 0.0038202023891634983, 'scheduled': 0.02675584978978817, 'division': 0.003714090891806661, 'greater': 0.014401469513143977, 'express': 0.015442536961835316, 'judge': 0.0056255693137487245, 'postal': 0.01448758917369378, 'timely': 0.009547408004733245, 'postponing': 0.023771054663189694, 'james': 0.013851907261754153, 'establishing': 0.015525275482306627, 'securities': 0.0009534453943686734, 'release': 0.000994876598539298, 'served': 0.007549203784675396, 'cliffs': 0.040192712832365154, 'realty': 0.027963087504615878, 'financial': 0.006067940095145491, 'instituting': 0.0047412698163078965, 'are': 0.0024740930507990123, 'fails': 0.013316168540074749, 'proceeding': 0.0031604332608950046, 'appear': 0.010272410560940281, 'for': 0.00039400705802880267, 'defend': 0.01741380246078061, 'asia': 0.028632052997003922, 'deemed': 0.008702433747892771, 'august': 0.04977168128599991, 'exchange': 0.0005249518349832802, 'answers': 0.015575372135328506, 'respondents': 0.022052537701443286, 'corp': 0.010689517733349321, 'eastland': 0.04688688851929564, 'revoked': 0.013668927632973267, 'against': 0.00388458245265655, 'telephonic': 0.016974699222910256, 'incorporated': 0.014218288205022512, 'ltd': 0.014589362228277223, 'postponed': 0.0209823757486391, 'otherwise': 0.007582793422109354, 'respondent': 0.0054516655648239295, 'washington': 0.005091825331541719, 'hearing': 0.02249732139496055, 'registration': 0.00852794089032502, 'conference': 0.03047929459424198, 'america': 0.001440915940409403, 'service': 0., 'holdings': 0.013681846704902636, 'due': 0.006408671617186687, 'commission': 0.0003703379846648254, 'enforcement': 0.0040660794190181156, 'that': 0.0008014938443938574, 'filed': 0.004601893518537033, 'with': 0.0011834598692978579, 'commence': 0.01644634375933502, 'accordance': 0.006995433441162778, 'default': 0.010296776564423474, 'rulings': 0.00878134432538458, 'will': 0.004454693796954499, 'matter': 0.0008886177351788676, 'were': 0.00495349246481796, 'grimes': 0.019846163536283806, 'and': 0.00019120815327678883, 'states': 0.0006306247910547858, 'file': 0.0014093253240757065, 'scheduling': 0.015608961772762466, 'any': 0.0024796720760966906, 'united': 0.0007573801483105002, 'answer': 0.010898516012467644, 'granite': 0.03675899168009445, 'practice': 0.005139863202847838, 'shall': 0.004531342051741035, 'act': 0.0, 'law': 0.003763823781424106, 'oip': 0.051638100949650705, 'rule': 0.0022896158932922445, 'order': 0.0013019082720686934, 'proceedings': 0.005189982291505062, 'the': 0.0003944745275206197, 'section': 0.0023716422033868588},
'tf_idf': {'september': 0.9050645370154226, 'issued': 0.6726833539094077, 'declaration': 2.515848344672878, 'pursuant': 0.20856052215192625, 'held': 0.9118122009441979, 'edt': 2.7289415601335865, 'its': 0.24681655330746285, 'before': 0.08602264841392737, 'prehearing': 4.93294654032065, 'mail': 1.3895641188014474, 'administrative': 0.5229047169927326, 'scheduled': 3.662308599647982, 'division': 0.5083803026181314, 'greater': 1.97125585843607, 'express': 2.113755921043353, 'judge': 0.7700211743422583, 'postal': 1.9830438141881122, 'timely': 1.306837746316985, 'postponing': 3.253753425874317, 'james': 1.8960324371984822, 'establishing': 2.125081070400406, 'securities': 0.1305064609991892, 'release': 0.1361775144891726, 'served': 1.0333259514584336, 'cliffs': 5.501530282373367, 'financial': 0.8305723558472731, 'registration': 1.1672943115358345, 'are': 0.3386508867204352, 'fails': 1.8227011641129327, 'proceeding': 0.43259631074797295, 'appear': 1.4060752258672398, 'for': 0.053931213109576576, 'section': 0.3246275377157642, 'asia': 3.9191210423271863, 'deemed': 1.191178684406377, 'august': 6.812687985048638, 'exchange': 0.07185477698391116, 'answers': 2.1319382401265434, 'respondents': 3.018524887177448, 'corp': 1.4631683549022034, 'eastland': 6.417821014247522, 'revoked': 1.8709864052723948, 'against': 0.531716982797367, 'telephonic': 2.323476452025422, 'incorporated': 1.9461822208839539, 'ltd': 1.9969743877240589, 'postponed': 2.87204240377426, 'otherwise': 1.037923660707063, 'respondent': 0.7462174379555567, 'washington': 0.696962938801067, 'hearing': 3.07940634519689, 'instituting': 0.648979320707021, 'conference': 4.171969254600169, 'law': 0.5151876808461564, 'service': 1.686270670295957, 'holdings': 1.8727547523291366, 'due': 0.8772112775465153, 'commission': 0.050691418761513035, 'enforcement': 0.5565600696713887, 'that': 0.10970751525181831, 'realty': 3.827553848801696, 'with': 0.16199056620216346, 'commence': 2.2511557904457957, 'accordance': 0.9575265316227637, 'default': 1.4094104151733782, 'rulings': 1.2019798695237245, 'will': 0.6097531392036623, 'matter': 0.12163292883290797, 'were': 0.6780281020682886, 'grimes': 2.7165190401350294, 'and': 0.026172342480994967, 'states': 0.08631916435383152, 'file': 0.19290675850759448, 'scheduling': 2.136535949375173, 'any': 0.33941453700573243, 'united': 0.10366928548906945, 'proceedings': 0.7103985456464252, 'answer': 1.4917757884518863, 'granite': 5.031526653127632, 'practice': 0.7035383049574554, 'shall': 0.6202446603049608, 'act': 0.1691336021724683, 'america': 0.19723084414778347, 'oip': 7.068161268029516, 'rule': 0.31339987486008647, 'order': 0.17820364137975558, 'filed': 0.629901660385122, 'the': 0.05399519977243369, 'defend': 2.3835803760951273},
'text': 'united states america before the securities and exchange commission washington administrative proceedings rulings release august administrative proceeding file the matter eastland financial corp granite cliffs incorporated and greater asia realty holdings ltd order postponing hearing and scheduling prehearing conference august the securities and exchange commission issued order instituting proceedings oip against respondents pursuant section the securities exchange act the hearing scheduled commence august august the division enforcement filed declaration service establishing that respondents were served with the oip postal service express mail august accordance with rule practice respondents answers are due august oip order that the hearing scheduled for august postponed and telephonic prehearing conference shall held edt september any respondent that fails timely file answer appear the prehearing conference otherwise defend the proceeding will deemed default and the registration its securities will revoked oip james grimes administrative law judge'}
model = gl.clustering.kmeans.create(data,num_clusters=4,features=['tf_idf_norm'])
[ERROR] graphlab.toolkits._main: Toolkit error: Exception in python callback function evaluation:
ZeroDivisionError('float division by zero',):
Traceback (most recent call last):
File "graphlab/cython/cy_pylambda_workers.pyx", line 426, in graphlab.cython.cy_pylambda_workers._eval_lambda
File "graphlab/cython/cy_pylambda_workers.pyx", line 169, in graphlab.cython.cy_pylambda_workers.lambda_evaluator.eval_simple
File "<ipython-input-14-405770e0af16>", line 3, in normalize
ZeroDivisionError: float division by zero
ERROR:graphlab.toolkits._main:Toolkit error: Exception in python callback function evaluation:
ZeroDivisionError('float division by zero',):
Traceback (most recent call last):
File "graphlab/cython/cy_pylambda_workers.pyx", line 426, in graphlab.cython.cy_pylambda_workers._eval_lambda
File "graphlab/cython/cy_pylambda_workers.pyx", line 169, in graphlab.cython.cy_pylambda_workers.lambda_evaluator.eval_simple
File "<ipython-input-14-405770e0af16>", line 3, in normalize
ZeroDivisionError: float division by zero
---------------------------------------------------------------------------
ToolkitError Traceback (most recent call last)
<ipython-input-17-1591a6c5df9a> in <module>()
----> 1 model = gl.clustering.kmeans.create(data,num_clusters=4,features=['tf_idf_norm'])
/home/praveen/anaconda/lib/python2.7/site-packages/graphlab/toolkits/clustering/kmeans.pyc in create(dataset, num_clusters, features, label, initial_centers, max_iterations, batch_size, verbose)
659
660 ## Create and return the model
--> 661 params = _gl.toolkits._main.run('kmeans_train', opts, verbose)
662 return KmeansModel(params['model'])
/home/praveen/anaconda/lib/python2.7/site-packages/graphlab/toolkits/_main.pyc in run(toolkit_name, options, verbose, show_progress)
87 _get_metric_tracker().track(metric_name, value=1, properties=track_props, send_sys_info=False)
88
---> 89 raise ToolkitError(str(message))
ToolkitError: Exception in python callback function evaluation:
ZeroDivisionError('float division by zero',):
Traceback (most recent call last):
File "graphlab/cython/cy_pylambda_workers.pyx", line 426, in graphlab.cython.cy_pylambda_workers._eval_lambda
File "graphlab/cython/cy_pylambda_workers.pyx", line 169, in graphlab.cython.cy_pylambda_workers.lambda_evaluator.eval_simple
File "<ipython-input-14-405770e0af16>", line 3, in normalize
ZeroDivisionError: float division by zero
UPDATE:
下面的線工作正常。
data['tf_idf_norm'] = data['tf_idf'].apply(lambda x: {key:value*(1.0/sum(x.values())) for key, value in x.iteritems()})
非常感謝您的回答!沒有意識到sFrame中的一行實際上是一本字典。 – Praveen
但是當我使用標準化列作爲訓練K均值模型的特徵時,我仍然會得到相同的錯誤。相應地更新了問題。 – Praveen
我沒有你的原始數據,所以很難重新創建,但我會嘗試在我回家的時候嘲笑一些東西。上面的字典示例中的tf_idf是否表示所有輸入的外觀?你知道具體哪個輸入導致中斷或者是否有輸入嗎? – Lost