
时间:2017-01-28 01:13:31

标签: python sorting nlp


import re
import math
from collections import Counter
import itertools

#first understadn this code so that we can manipulate it.
WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    intersection =  set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

#count the number of tweets set it to a variable and then set it as the length of this or  what ever
#This is where the text comes from
with open("positive copy.txt", "r") as pt:
    lines = pt.readlines()
    # Count how many lines we have
    count = len(lines)
    # Create a count * count size matrix
    Matrix = [[1 for x in range(count)] for y in range(count)] 
    # Loop through lines assigning x as the number of line we're on and lineA as it's text
    for x, lineA in enumerate(lines):
        vectorA = text_to_vector(lineA)
        for y, lineB in enumerate(itertools.islice(lines, count - x)):
            vectorB = text_to_vector(lineB)
            cosine = get_cosine(vectorA, vectorB)
            print lineA, lineB, "\n Cosine:", cosine, "\n"
            Matrix[y][x]=get_cosine(vectorA, vectorB)
            Matrix[x][y]=get_cosine(vectorA, vectorB)
    print Matrix


Hello my name is Jeff
Hello everyone I’m named Jeff
this has absolutely nothing to do
everyone Im a doctor
hello I don’t even know whats happening
whats  happening is that you not know


[[0.9999999999999998, 0.33806170189140655, 0.0, 0.0, 0.0, 0.16903085094570328], [0.33806170189140655, 0.9999999999999999, 0.0, 0.1889822365046136, 0.13363062095621217, 1], [0.0, 0.0, 1.0000000000000002, 0.0, 1, 1], [0.0, 0.1889822365046136, 0.0, 1, 1, 1], [0.0, 0.13363062095621217, 1, 1, 1, 1], [0.16903085094570328, 1, 1, 1, 1, 1]]


0 个答案:
