#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import nltk
import re
from nltk.tree import *
from nltk.chunk.util import tagstr2tree
from nltk import word_tokenize, pos_tag
text = "Yarın, Mehmet ile birlikte Ankara'da ki Nüfus Müdürlüğü'ne, Aziz
Yıldırım ile birlikte, Şükrü Saraçoğlu Stadı'na gideceğiz.".decode("utf-8")
tagged_text = pos_tag(word_tokenize(text))
tagged_text2 = word_tokenize(text)
grammar = "NP:{<NNP>+}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(tagged_text)
for tree in result:
print(tree)
wrapped = "(ROOT "+ str(result) + ")" # Add a "root" node at the top
trees = nltk.Tree.fromstring(wrapped, read_leaf=lambda x: x.split("/")[0])
for tree in trees:
print(tree.leaves())
for tree2 in result:
print(nltk.Tree.fromstring(str(tree2), read_leaf=lambda x: x.split("/")[0]))
(NP Yar\u0131n/NNP)
(u',', ',')
(NP Mehmet/NNP)
(u'ile', 'NN')
(u'birlikte', 'NN')
(NP Ankara'da/NNP ki/NNP Nufus/NNP Mudurlugu'ne/NNP)
(u',', ',')
(NP Aziz/NNP Y\u0131ld\u0131r\u0131m/NNP)
(u'ile', 'NN')
(u'birlikte', 'NN')
(u',', ',')
(NP Sukru/NNP Saracoglu/NNP Stad\u0131'na/NNP)
(u'gidece\u011fiz', 'NN')
(u'.', '.')
['Yar\\u0131n', ',', 'Mehmet', 'ile', 'birlikte', "Ankara'da", 'ki', 'Nufus', "Mudurlugu'ne", ',', 'Aziz', 'Y\\u0131ld\\u0131r\\u0131m', 'ile', 'birlikte', ',', 'Sukru', 'Saracoglu', "Stad\\u0131'na", 'gidecegiz', '.']
(NP Yar\u0131n)
(u',', ',')
(NP Mehmet)
(u'ile', 'NN')
(u'birlikte', 'NN')
(NP Ankara'da ki Nufus Mudurlugu'ne)
(u',', ',')
(NP Aziz Y\u0131ld\u0131r\u0131m)
(u'ile', 'NN')
(u'birlikte', 'NN')
(u',', ',')
(NP Sukru Saracoglu Stad\u0131'na)
(u'gidece\u011fiz', 'NN')
(u'.', '.')
我引用自:How can I remove POS tags before slashes in nltk?
我想分組適當的名稱,並刪除標籤,但是當我使用的解決方案。它的效果整個文本和之後,我的塊解析消失。我真的嘗試瞭解樹結構,但我如何應用for語句中的去除函數。我希望我的輸出如下:
我想要的輸出:
[Yar\u0131n]
[,]
[Mehmet]
[ile]
[birlikte]
[Ankara'da ki Nufus Mudurlugu'ne]
...
...
此外,我不能使用UTF-8,你看我的輸出是充滿了非ASCII字符處理。我該如何處理它?
編輯:
for i in range(len(tree)):
arr.append(nltk.Tree.fromstring(str(tree[i]), read_leaf=lambda x: x.split("/")[0]).leaves())
print(arr[i])
我發現我建議立即進行刪除寫什麼代碼,但現在我有以下錯誤。我想我不能在我的數組上添加標點符號。
['Yar\\u0131n']
Traceback (most recent call last):
File "./chunk2.py", line 61, in <module>
arr.append(nltk.Tree.fromstring(str(tree[i]), read_leaf=lambda x: x.split("/")[0]).leaves())
File "/usr/local/lib/python2.7/dist-packages/nltk/tree.py", line 630, in fromstring
cls._parse_error(s, match, open_b)
File "/usr/local/lib/python2.7/dist-packages/nltk/tree.py", line 675, in _parse_error
raise ValueError(msg)
ValueError: Tree.read(): expected u'(' but got ','
at index 0.
","
^
嘗試'文本= U「Yarın,穆罕默德ILE birlikte Ankara'daきNüfusMüdürlüğü'ne,阿齊茲 Yıldırımile birlikte,ŞükrüSaraçoğluStadı'nagideceğiz。「' – alvas
恐怕它沒有奏效。順便說一句,你可以幫我解決我編輯過的問題嗎? –
你想要的輸出是什麼?您輸出字符串並將字符串讀回樹中。我想如果我們知道輸出是什麼,你可以直接操縱Tree對象來獲取它。 – alvas