根據我另一個回答下的評論,提問者(OP)還有一個關於搜尋二維集羣的問題。以下是針對該問題的回答。
我使用取自我的程式庫 eegpy 的 find_clusters 方法。它會遍歷一個二維陣列,找出所有高於/低於給定閾值的集羣。
這是我的代碼:
import pylab as plt
import numpy as np
from Queue import Queue
def find_clusters(ar, thres, cmp_type="greater"):
    """Find all connected clusters of a 2d array that pass a threshold test.

    Every element of ``ar`` is compared against ``thres``.  Elements that
    pass the comparison and touch each other through a shared edge
    (up/down/left/right; diagonals do not connect) form one cluster.

    Parameters
    ----------
    ar : 2d numpy array
        The values to threshold (e.g. a test statistic).
    thres : scalar
        Threshold the values are compared against (strict comparison).
    cmp_type : {"greater", "lower", "abs_greater"}
        ``"greater"`` selects ``ar > thres``, ``"lower"`` selects
        ``ar < thres`` and ``"abs_greater"`` selects ``abs(ar) > thres``.

    Returns
    -------
    list of 2d boolean arrays
        One mask per cluster, each with the same shape as ``ar``; ``True``
        marks the elements belonging to that cluster.  Clusters are listed
        in the order their first element is met in a row-by-row scan.

    Raises
    ------
    ValueError
        If ``cmp_type`` is not one of the three supported names.
    """
    if cmp_type not in ("lower", "greater", "abs_greater"):
        raise ValueError("cmp_type must be in [\"lower\",\"greater\",\"abs_greater\"]")

    # The builtin ``bool`` is used as dtype: the ``np.bool`` alias is gone
    # from current NumPy releases.
    if cmp_type == "lower":
        ar_in = (ar < thres).astype(bool)
    elif cmp_type == "greater":
        ar_in = (ar > thres).astype(bool)
    else:  # cmp_type == "abs_greater"
        ar_in = (abs(ar) > thres).astype(bool)

    n_rows = ar_in.shape[0]
    n_cols = ar_in.shape[1]
    already_visited = np.zeros(ar_in.shape, dtype=bool)
    clusters = []

    for i_s in range(n_rows):  # i_s as in i_sample
        for i_f in range(n_cols):
            if already_visited[i_s, i_f] or not ar_in[i_s, i_f]:
                continue

            # New, unvisited seed: flood-fill its whole cluster.  Cells are
            # marked as visited when they are scheduled, so each cell is
            # put on the work stack at most once.
            mask = np.zeros(ar_in.shape, dtype=bool)
            already_visited[i_s, i_f] = True
            pending = [(i_s, i_f)]
            while pending:
                pos_x, pos_y = pending.pop()
                mask[pos_x, pos_y] = True
                # Direct neighbours only
                for nb_x, nb_y in ((pos_x - 1, pos_y), (pos_x + 1, pos_y),
                                   (pos_x, pos_y - 1), (pos_x, pos_y + 1)):
                    if (0 <= nb_x < n_rows and 0 <= nb_y < n_cols
                            and ar_in[nb_x, nb_y]
                            and not already_visited[nb_x, nb_y]):
                        already_visited[nb_x, nb_y] = True
                        pending.append((nb_x, nb_y))
            clusters.append(mask)
    return clusters
fn = "14318737.txt"

# The first line of the file is a header; its first token is dropped and
# the remaining tokens are used as x-axis labels.
with open(fn, "r") as f:
    labels = f.readline().rstrip("\n").split()[1:]

# Column 0 is replaced by the constant 0 while loading (presumably it holds
# non-numeric row names -- confirm against the data file), so it can never
# exceed the threshold of 0 used below.
data = np.loadtxt(fn, skiprows=1, converters={0: lambda x: 0})
clusters = find_clusters(data, 0, "greater")

# Only the real data columns are shown; zeros are masked out of the image.
plot_data = np.ma.masked_equal(data[:, 1:], 0)
plt.subplots_adjust(left=0.1, bottom=0.15, right=0.99, top=0.95)

# The extent centres image column j at x = j + 1, i.e. at its column index
# in ``data``.  The cluster masks have the full width of ``data`` and are
# contoured in index coordinates, so image and contours line up.
plt.imshow(plot_data, cmap="Reds", interpolation="nearest", aspect="auto",
           vmin=0,
           extent=[0.5, plot_data.shape[1] + 0.5, plot_data.shape[0] - 0.5, -0.5])
plt.colorbar()

for cl in clusters:
    # ``linewidths`` is the keyword contour understands (``lw`` is not).
    plt.contour(cl.astype(int), [0.5], colors="k", linewidths=2)

# Exactly one tick per label, starting at x = 1 (the first image column).
plt.xticks(np.arange(1, len(labels) + 1), labels, rotation=90, va="top", ha="center")
plt.show()
它會產生如下形式的圖像:
clusters
是一個由布爾二維陣列(True/False)組成的列表。每個陣列表示一個集羣,其中每個布爾值指示特定「點」是否屬於該集羣。您可以在任何進一步的分析中使用它。
編輯
現在對這些集羣做一些更有趣的處理:
import pylab as plt
import numpy as np
from Queue import Queue
def find_clusters(ar, thres, cmp_type="greater"):
    """Find all connected clusters of a 2d array that pass a threshold test.

    Every element of ``ar`` is compared against ``thres``.  Elements that
    pass the comparison and touch each other through a shared edge
    (up/down/left/right; diagonals do not connect) form one cluster.

    Parameters
    ----------
    ar : 2d numpy array
        The values to threshold (e.g. a test statistic).
    thres : scalar
        Threshold the values are compared against (strict comparison).
    cmp_type : {"greater", "lower", "abs_greater"}
        ``"greater"`` selects ``ar > thres``, ``"lower"`` selects
        ``ar < thres`` and ``"abs_greater"`` selects ``abs(ar) > thres``.

    Returns
    -------
    list of 2d boolean arrays
        One mask per cluster, each with the same shape as ``ar``; ``True``
        marks the elements belonging to that cluster.  Clusters are listed
        in the order their first element is met in a row-by-row scan.

    Raises
    ------
    ValueError
        If ``cmp_type`` is not one of the three supported names.
    """
    if cmp_type not in ("lower", "greater", "abs_greater"):
        raise ValueError("cmp_type must be in [\"lower\",\"greater\",\"abs_greater\"]")

    # The builtin ``bool`` is used as dtype: the ``np.bool`` alias is gone
    # from current NumPy releases.
    if cmp_type == "lower":
        ar_in = (ar < thres).astype(bool)
    elif cmp_type == "greater":
        ar_in = (ar > thres).astype(bool)
    else:  # cmp_type == "abs_greater"
        ar_in = (abs(ar) > thres).astype(bool)

    n_rows = ar_in.shape[0]
    n_cols = ar_in.shape[1]
    already_visited = np.zeros(ar_in.shape, dtype=bool)
    clusters = []

    for i_s in range(n_rows):  # i_s as in i_sample
        for i_f in range(n_cols):
            if already_visited[i_s, i_f] or not ar_in[i_s, i_f]:
                continue

            # New, unvisited seed: flood-fill its whole cluster.  Cells are
            # marked as visited when they are scheduled, so each cell is
            # put on the work stack at most once.
            mask = np.zeros(ar_in.shape, dtype=bool)
            already_visited[i_s, i_f] = True
            pending = [(i_s, i_f)]
            while pending:
                pos_x, pos_y = pending.pop()
                mask[pos_x, pos_y] = True
                # Direct neighbours only
                for nb_x, nb_y in ((pos_x - 1, pos_y), (pos_x + 1, pos_y),
                                   (pos_x, pos_y - 1), (pos_x, pos_y + 1)):
                    if (0 <= nb_x < n_rows and 0 <= nb_y < n_cols
                            and ar_in[nb_x, nb_y]
                            and not already_visited[nb_x, nb_y]):
                        already_visited[nb_x, nb_y] = True
                        pending.append((nb_x, nb_y))
            clusters.append(mask)
    return clusters
fn = "14318737.txt"

# The first line of the file is a header; its first token is dropped and
# the remaining tokens are used as x-axis / region labels.  In every other
# line the first token is dropped too and the rest is parsed as integers.
# Blank lines are skipped so they cannot produce ragged rows.
with open(fn, "r") as f:
    labels = f.readline().rstrip("\n").split()[1:]
    data = [[int(v) for v in line.split()[1:]] for line in f if line.strip()]
data = np.array(data)

clusters = find_clusters(data, 0, "greater")
# Only keep clusters with more than five items, largest first.
large_clusters = [cl for cl in clusters if cl.sum() > 5]
large_clusters = sorted(large_clusters, key=lambda cl: cl.sum(), reverse=True)

# Zeros are masked out of the image.
plot_data = np.ma.masked_equal(data[:, :], 0)
plt.subplots_adjust(left=0.1, bottom=0.15, right=0.99, top=0.95)
# The extent centres image column j at x = j, matching the index
# coordinates in which the cluster masks are contoured below.
plt.imshow(plot_data, cmap="Reds", interpolation="nearest", aspect="auto",
           vmin=0,
           extent=[-0.5, plot_data.shape[1] - 0.5, plot_data.shape[0] - 0.5, -0.5])
plt.colorbar()

for cl in large_clusters:
    # ``linewidths`` is the keyword contour understands (``lw`` is not).
    plt.contour(cl.astype(int), [.5], colors="k", linewidths=2)

# Exactly one tick per label, starting at x = 0 (the first image column).
plt.xticks(np.arange(len(labels)), labels, rotation=90, va="top", ha="center")

# Single-argument print(...) calls behave the same under Python 2 and 3.
print("Summary of all large clusters:\n")
print("#\tSize\tIn regions")
for i, cl in enumerate(large_clusters):
    # Indices of the columns that contain at least one cluster member,
    # in ascending order.
    regions_in_cluster = np.where(np.any(cl, axis=0))[0]
    min_region = labels[regions_in_cluster[0]]
    max_region = labels[regions_in_cluster[-1]]
    print("%i\t%i\t%s to %s" % (i, cl.sum(), min_region, max_region))

plt.xlim(-0.5, plot_data.shape[1] - 0.5)
plt.show()
我篩選出所有包含五個以上點的集羣,並且只繪製這些集羣。您也可以改用每個集羣內 data 的總和作爲篩選條件。然後我將這些大型集羣按大小降序排列。
最後,我打印所有大型集羣的摘要,包括每個集羣所跨越的區域名稱(從第一個區域到最後一個區域)。
除了你的數據之外,你能否展示一下[你嘗試過什麼](http://mattgemmell.com/2008/12/08/what-have-you-tried/)? –
嗯,這個問答網站是針對實際編程問題的。我不認爲這裏有圖表方面的專家。 – Denis
你說的「巨大」是什麼意思?是行很多還是列很多?如果其中任何一個數量接近繪圖所用的像素數,這就不是一個好方法。 –