2013-07-21 85 views
2

我有一個內容列表。每個內容都有一個觀看此內容的用戶列表。我想用Python創建一個如下圖所示的圖表。使用python創建圖表

我知道圓的半徑與觀看內容的用戶數成正比。圓圈之間的距離與聯合用戶的數量成正比。

所以我對解決這個問題的任何方案都感興趣(算法或現有的包)。另外,也許有人知道這種圖表叫什麼名稱(鏈接雲?)。

enter image description here

你有任何想法如何做呢?

+3

你可能想看看 [`networkx`](http://networkx.github.io/) 或 [`pyGraphviz`](http://networkx.lanl.gov/pygraphviz/) – inspectorG4dget

+1

當我過去處理類似的數據時,我只是將其導出爲JSON,並用 [d3](http://d3js.org) 將其可視化。在d3的術語中,您要找的是「force」(力導向)或「pack」(打包)佈局(後者會生成「泡泡圖」)。 – roippi

+1

除了直接使用JavaScript的d3,你也可以使用它的Python包裝器(或者說移植版)d3py(https://github.com/mikedewar/d3py)。 – dilbert

回答

0

感謝您的回覆。如果我在這裏描述我如何解決我的問題,我認爲這將是有用的。

這是一段代碼。首先,我使用層次聚類或kmeans對數據進行聚類。然後我準備一個簡單的字典並將其轉換爲d3風格的JSON。該JSON與 http://bl.ocks.org/mbostock/4063530 示例中的HTML一起使用。

#coding: utf-8 
import argparse 
import json 
import logging 
import sys 
import numpy 
import pylab 
from sklearn.cluster import KMeans 
from common import init_db_connection 
from numpy import array 
from scipy.cluster import hierarchy 

# Module-level logger; its name is the fixed string "build2".
logger = logging.getLogger("build2") 

# Accepted values of the --clustering command-line option.
CLUSTERING_TYPE_KMEANS = "KMEANS" 
CLUSTERING_TYPE_HIERARCHY = "HIERARCHY" 

def get_coord_names(cursor, limit):
    """Return the distinct user ids seen across the top `limit` contents.

    The contents are ranked by the length of their ``user_ids`` array
    (descending) and only the first `limit` rows are considered.  Their
    ``user_ids`` arrays are unnested, de-duplicated and sorted by the
    database.

    :param cursor: DB cursor providing ``mogrify`` and ``execute`` and
        iterable over result rows (presumably psycopg2 -- confirm).
    :param limit: number of contents to take into account.
    :return: list of user ids in ascending order.
    """
    # The ranking column has to be ``user_ids`` -- the same column that is
    # selected here and that get_matrix_of_observations_and_objects orders
    # by -- so that both queries pick the same set of contents.
    sql = """
     SELECT DISTINCT unnest(user_ids) ivi_id
     FROM (
      SELECT user_ids
      FROM content_watched_by_users cw
       JOIN content c ON c.id = cw.content_id
      ORDER BY array_length(user_ids, 1) DESC
      LIMIT %s
     ) as t
     ORDER BY ivi_id;
    """
    logger.info(cursor.mogrify(sql, (limit,)))
    cursor.execute(sql, (limit,))

    # Each result row is a 1-tuple holding a single user id.
    return [row[0] for row in cursor]


def get_matrix_of_observations_and_objects(cursor, coords_name, limit):
    """Build the 0/1 observation matrix and the list of described objects.

    One row is produced per content returned by the query (the `limit`
    contents with the longest ``user_ids`` arrays).  Column ``j`` of a row
    is 1 when ``coords_name[j]`` occurs in that content's ``user_ids`` and
    0 otherwise.

    :param cursor: DB cursor providing ``mogrify`` and ``execute`` and
        iterable over result rows.
    :param coords_name: list of user ids defining the matrix columns.
    :param limit: number of contents to fetch.
    :return: tuple ``(matrix, objects)`` where ``matrix`` is a numpy array
        built from the rows and ``objects`` is a list of
        ``(title, number_of_users)`` tuples in row order.
    """
    sql = """
     SELECT c.title, user_ids
     FROM content_watched_by_users cw
      JOIN content c ON c.id = cw.content_id
     ORDER BY array_length(user_ids, 1) DESC LIMIT %s"""
    logger.info(cursor.mogrify(sql, (limit,)))
    cursor.execute(sql, (limit,))

    logger.info(u"Начинаем получать матрицу наблюдений и массив объектов")

    # Map each user id to its column once, instead of a linear
    # ``list.index`` scan per watched user.  ``setdefault`` keeps the first
    # position if an id is repeated, exactly as ``list.index`` would.
    column_of = {}
    for position, user_id in enumerate(coords_name):
        column_of.setdefault(user_id, position)
    width = len(coords_name)

    matrix = []
    objects = []
    # The first selected column is the content title, not its id.
    for title, user_ids in cursor:
        logger.info(u"Обрабатывается %s", title)
        objects.append((title, len(user_ids)))

        row = [0] * width
        for user_id in user_ids:
            position = column_of.get(user_id)
            if position is None:
                # A user id that is not a known coordinate is reported and
                # skipped; the row is still emitted.
                logger.error(u"Что-то не так с user_ids %s", user_ids)
            else:
                row[position] = 1

        matrix.append(row)
    logger.info(u"Матрица наблюдений и массив объектов получены")
    return array(matrix), objects


def fcluster_to_d3_dict(fcluster, objects, name_cluster=False):
    """Convert 1-based flat cluster labels into a nested d3-style dict.

    :param fcluster: sequence of cluster labels, one per object; label ``n``
        places the object into the ``n``-th cluster (labels start at 1).
    :param objects: sequence whose items expose the object name at index 0
        and its size at index 1.
    :param name_cluster: when true, a cluster is named after the first
        object put into it; otherwise cluster names stay empty.
    :return: ``{"name": "", "children": [cluster, ...]}`` where every
        cluster is ``{"name": ..., "children": [{"name", "size"}, ...]}``.
    """
    # One empty bucket per label value up to the largest label.
    buckets = [{"name": "", "children": []} for _ in range(max(fcluster))]

    for position, label in enumerate(fcluster):
        title = objects[position][0]
        weight = objects[position][1]

        bucket = buckets[label - 1]  # labels are 1-based
        bucket["children"].append({"name": title, "size": weight})

        if not name_cluster or bucket["name"]:
            continue
        bucket["name"] = title

    return {"name": "", "children": buckets}


def code_to_d3_dict(code, objects, name_cluster=False):
    """Convert 0-based cluster codes into a nested d3-style dict.

    :param code: sequence of cluster indices, one per object; index ``n``
        places the object into cluster number ``n`` (indices start at 0).
    :param objects: sequence whose items expose the object name at index 0
        and its size at index 1.
    :param name_cluster: when true, a cluster is named after the first
        object put into it; otherwise cluster names stay empty.
    :return: ``{"name": "", "children": [cluster, ...]}`` where every
        cluster is ``{"name": ..., "children": [{"name", "size"}, ...]}``.
    """
    # Codes are 0-based, so the largest code needs ``max + 1`` buckets.
    buckets = [{"name": "", "children": []} for _ in range(max(code) + 1)]

    for position, label in enumerate(code):
        title = objects[position][0]
        weight = objects[position][1]

        bucket = buckets[label]
        bucket["children"].append({"name": title, "size": weight})

        if not name_cluster or bucket["name"]:
            continue
        bucket["name"] = title

    return {"name": "", "children": buckets}


def save_to_json(result_dict, output_file="d3/flare.json"):
    """Serialize `result_dict` as JSON into `output_file`.

    :param result_dict: JSON-serializable object to write.
    :param output_file: destination path; an existing file is overwritten.
    """
    logger.info(u"Перегоняем в JSON")
    # The context manager closes the file even when json.dump raises
    # (e.g. on a non-serializable value).
    with open(output_file, "w") as f:
        json.dump(result_dict, f)
    logger.info(u"JSON сохранен по адресу: %s", output_file)


def hierarchy_clustering(matrix, objects, threshold, name_cluster):
    """Cluster the observations hierarchically and return a d3-style dict.

    A linkage is computed with the ``ward`` method and cut into flat
    clusters using `threshold` with the ``distance`` criterion.  As a side
    effect the dendrogram is drawn and the current pylab figure is saved
    to ``temp.png``.

    :param matrix: observation matrix passed to ``hierarchy.linkage``.
    :param objects: per-row ``(name, size)`` items, forwarded to
        ``fcluster_to_d3_dict``.
    :param threshold: cut-off passed to ``hierarchy.fcluster``.
    :param name_cluster: forwarded to ``fcluster_to_d3_dict``.
    :return: the dict produced by ``fcluster_to_d3_dict``.
    """
    linkage_matrix = hierarchy.linkage(matrix, method="ward")
    flat_labels = hierarchy.fcluster(linkage_matrix, threshold, criterion="distance")

    # Render the dendrogram and dump the figure to disk for inspection.
    hierarchy.dendrogram(linkage_matrix)
    pylab.savefig("temp.png")

    logger.info(flat_labels)
    return fcluster_to_d3_dict(flat_labels, objects, name_cluster)


def kmeans_clustering(matrix, objects, k, name_cluster=False):
    """Cluster the observations with KMeans and return a d3-style dict.

    The matrix is divided by its maximum and subtracted from 1 before
    being fitted.

    :param matrix: numpy observation matrix.
    :param objects: per-row ``(name, size)`` items, forwarded to
        ``code_to_d3_dict``.
    :param k: number of clusters passed to ``KMeans`` as ``n_clusters``.
    :param name_cluster: forwarded to ``code_to_d3_dict``.
    :return: the dict produced by ``code_to_d3_dict``.
    """
    peak = numpy.max(matrix)
    inverted = 1 - (matrix / peak)

    model = KMeans(n_clusters=k)
    model.fit(inverted)

    labels = model.labels_
    logger.info(labels)
    return code_to_d3_dict(labels, objects, name_cluster)


# Script entry point: parse the command line, load the observation matrix
# from the database, cluster it and write the d3-style JSON.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=u'Скрипт для получения красивого графа')
    # Database
    parser.add_argument('--db_host', default="localhost", type=str, dest="db_host",
                        help=u'Хост БД, по умолчанию: localhost')
    parser.add_argument('--db_port', default="5432", type=str, dest="db_port",
                        help=u'Порт БД, по умолчанию: 5432')
    # The help text now states the real default (da_test, not da).
    parser.add_argument('--db_name', default="da_test", type=str, dest="db_name",
                        help=u'Имя БД, по умолчанию: da_test')
    parser.add_argument('--db_user', default="da", type=str, dest="db_user",
                        help=u'Пользователь БД, по умолчанию: da')
    # General
    # Choices must be attribute names of the logging module because the
    # value is resolved with getattr(logging, ...) below; 'WARNINGS' is not
    # one of them and used to crash with AttributeError.
    parser.add_argument("--log-level", default='INFO', type=str, dest="log_level",
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], help=u"Уровень логирования")
    parser.add_argument('-l', '--limit', required=True, type=int, dest="limit",
                        help=u'Количество контента в выборке. '
                             u'Контент осортирован по количеству просмотревших его пользователей')
    parser.add_argument('-o', '--output', required=True, type=str, dest="output_file_path",
                        help=u'Куда сохранять JSON-результат')
    parser.add_argument('-n', '--name_cluster', action="store_true", dest="name_cluster",
                        help=u'Именовать кластеры по первому элементу в кластере')
    parser.add_argument("-c", "--clustering", default=CLUSTERING_TYPE_KMEANS, type=str, dest="clustering_type",
                        choices=[CLUSTERING_TYPE_KMEANS, CLUSTERING_TYPE_HIERARCHY], help=u"Тип кластеризации")
    # kmeans
    parser.add_argument('-k', '--max_k', type=int, dest="max_k",
                        help=u'Максимальное число кластеров. Только для kmeans')
    # Hierarchical
    parser.add_argument('-t', '--threshold', type=float, dest="threshold",
                        help=u'Граница разделения на плоские кластеры. Только для иерархической кластеризации')

    args = parser.parse_args()

    # Fail early, before touching the database, when the parameter required
    # by the selected algorithm is missing (it would otherwise be None).
    if args.clustering_type == CLUSTERING_TYPE_KMEANS and args.max_k is None:
        parser.error(u"Для kmeans необходимо указать --max_k")
    if args.clustering_type == CLUSTERING_TYPE_HIERARCHY and args.threshold is None:
        parser.error(u"Для иерархической кластеризации необходимо указать --threshold")

    logging.basicConfig(stream=sys.stdout, level=getattr(logging, args.log_level),
                        format="%(asctime)s :: %(message)s")

    connection = init_db_connection(args.db_host, args.db_port, args.db_user, args.db_name)
    try:
        cursor = connection.cursor()
        coords_name = get_coord_names(cursor, args.limit)
        matrix, objects = get_matrix_of_observations_and_objects(cursor, coords_name, args.limit)
    finally:
        # Release the connection even if one of the queries fails.
        connection.close()

    if args.clustering_type == CLUSTERING_TYPE_KMEANS:
        result_dict = kmeans_clustering(matrix, objects, args.max_k, args.name_cluster)
    elif args.clustering_type == CLUSTERING_TYPE_HIERARCHY:
        result_dict = hierarchy_clustering(matrix, objects, args.threshold, args.name_cluster)
    else:
        # Not reachable through argparse (choices restricts the value);
        # kept as a guard.
        raise Exception(u"Неизвестный тип кластеризации")
    save_to_json(result_dict, args.output_file_path)

結果如下:

KMeans clustering