-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmake_inputs.py
More file actions
85 lines (63 loc) · 2.3 KB
/
make_inputs.py
File metadata and controls
85 lines (63 loc) · 2.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import numpy as np
import pandas as pd
import json
from preprocess import rap_izer
import cPickle
print('make inputs!')

# read in csv of lines & convert to standard format:
df = pd.read_csv('fire_bars.csv')
# get rid of the empty lines in the dataset.  read_csv parses empty fields as
# NaN by default, so comparing against '' alone never removes them.
df = df.dropna(subset=['verses'])
df = df.loc[df['verses'] != '']
# renumber the rows 0..n-1: the later df['verses'][i] lookups go by index
# label, so gaps left by the dropped rows would otherwise raise KeyError
df = df.reset_index(drop=True)

# open vocabulary:
vocab = pd.read_csv('rap_vocab.csv')
# remove the row if it has a frequency less than a threshold amount:
thresh = 2
vocab = vocab.loc[vocab['freqs'] >= thresh]

# turn vocab into a dictionary (lookup table): second column -> first column.
# Positional access (.iloc) is used because the frequency filter can leave
# gaps in the row labels, and the old label-based .ix accessor is gone from
# current pandas.
# NOTE(review): presumably column 0 is the word index and column 1 the word
# itself -- confirm against rap_vocab.csv.
LUT = {
    str(word): index
    for index, word in zip(vocab.iloc[:, 0].values, vocab.iloc[:, 1].values)
}
# add the UNKnown words at the end of dictionary
# NOTE(review): if column 0 keeps its pre-filter numbering, len(LUT)+1 could
# collide with the index of a retained word -- left unchanged because
# downstream code may depend on this value; confirm.
LUT['UNK'] = len(LUT)+1
print('lookup table is the length of: ', len(LUT))

# save memory by deleting unneeded array
del vocab
def get_word_index(word):
    '''
    Return the vocabulary index for `word`.

    The word is converted with str() and looked up in the module-level
    LUT dictionary.  A word that is not a key of LUT gets the index stored
    under LUT['UNK'] instead.
    '''
    try:
        # we have the word in vocab
        return LUT[str(word)]
    except (KeyError, UnicodeError):
        # it's not in vocabulary (or str() could not convert it, which the
        # previous bare except also mapped to UNK).  Anything else, e.g.
        # KeyboardInterrupt, now propagates instead of being hidden.
        return LUT['UNK']
# Walk consecutive lines of the 'verses' column and turn every pair of
# neighbouring bars from the same song into an (input, target) pair of
# word-index lists.  A line equal to 'sse' marks the start of a new song.
data = []  # will append [input_indices, target_indices] pairs to this list
song = -1  # counts up at each 'sse' marker; -1 so the first song becomes 0

# Work on a plain list so lines are addressed by position.  Looking them up
# through df['verses'][i] goes by index label and fails when the index has
# gaps (e.g. after rows were filtered out).
verses = df['verses'].tolist()

# the last line is never used as an input bar: it has no following line
for i in range(len(verses) - 1):
    current_bar = verses[i]
    next_bar = verses[i + 1]

    if current_bar == 'sse':
        # start of a new song: count it, the marker itself is not a bar
        song += 1
        continue
    if next_bar == 'sse':
        # the song ends here, so this bar has no target bar: ignore it
        continue

    # turn the lines into a vector of words...
    line1 = rap_izer(str(current_bar)).split()
    line2 = rap_izer(str(next_bar)).split()
    # convert each word into its index number, if it's not in
    # there then give it a UNK number
    l1 = [get_word_index(word) for word in line1]
    l2 = [get_word_index(word) for word in line2]
    # add these sequences to the dataset:
    data.append([l1, l2])

    # show the data every so often; done only on iterations that produced a
    # pair so that l1 and l2 are always defined and current
    if (i % 100) == 0:
        print('iteration %s\'s data looks like' % i)
        print('input bar\n',l1,'\n\noutput bar\n',l2)

# save the nested list of pairs by pickling it; the with-block makes sure the
# file is flushed and closed
with open("filename.pkl", "wb") as pkl_file:
    cPickle.dump(data, pkl_file)