Improving The Speed When Calculating Permutation On Multiple Elements In List Of Arrays
Suppose we have a list of arrays containing indices. Each row (i.e. array) is associated with a specific user id. The algorithm only stores indices if the user appears more than once.
Solution 1:
Function using dict:
def split_by_num_new(data, k):
    """Build one permutation dict per user group that has more than one index.

    Each group returned by ``_groupby_user`` with more than one entry is
    expanded into a DataFrame holding one permutation per row. The last
    column becomes the index and the frame is converted with
    ``DataFrame.to_dict()``.

    Args:
        data: Object exposing ``data.user.to_numpy()`` (presumably a pandas
            DataFrame with a ``user`` column -- confirm against the caller).
        k: Not used by this implementation; kept so existing calls such as
            ``split_by_num_new(data, k=1)`` keep working.

    Returns:
        list: One ``to_dict()`` result per retained group, in group order.

    Note:
        ``itertools.permutations`` produces ``n!`` rows for a group of size
        ``n``, so the cost grows factorially with the group size.
    """
    temp_indices = []
    user_indices = data.user.to_numpy()
    # NOTE(review): _groupby_user is defined elsewhere; it is assumed to
    # return an iterable of per-user index arrays -- confirm.
    user_split_indices = [
        group for group in _groupby_user(user_indices, True) if len(group) > 1
    ]
    loop_start = timeit.default_timer()
    for u_data in user_split_indices:
        perm_ = pd.DataFrame(itertools.permutations(u_data))
        # Index by the last column so every inner dict is keyed by the final
        # element of a permutation.
        p_ = perm_.set_index(perm_.shape[1] - 1).to_dict()
        append_start = timeit.default_timer()
        temp_indices.append(p_)
        append_stop = timeit.default_timer()
        print('append Time Completed at : ', append_stop - append_start)
    loop_stop = timeit.default_timer()
    print('Loop Time Completed at : ', loop_stop - loop_start)
    return temp_indices
Call the function that uses dict:
temp_indices = split_by_num_new(data, k=1)
# Collect the per-user frames first and concatenate once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
frames = []
for ind, perm_dict in enumerate(temp_indices):
    print(ind)
    # Entry 0 of each dict is turned into a two-column (key, value) frame.
    frames.append(pd.DataFrame(perm_dict[0].items()))
# pd.concat raises on an empty list, so fall back to an empty frame.
c = pd.concat(frames, axis=0) if frames else pd.DataFrame()
append Time Completed at : 5.999991117278114e-07
append Time Completed at : 5.999991117278114e-07
Loop Time Completed at : 0.00579959999959101
Total time on the entire dataset of (1311612, 60): 3089.5768801999984 secs
Function using dict, map and lambda:
def func(u_data):
    """Return the permutations of ``u_data`` as a dict keyed by last element.

    All permutations are laid out as a DataFrame (one permutation per row),
    the last column is made the index, and the result is converted with
    ``DataFrame.to_dict()``. The outer keys are the remaining column
    positions; each inner dict maps the last element of a permutation to the
    element at that position. When several permutations share the same last
    element, the later row overwrites the earlier one.

    Args:
        u_data: Iterable of values to permute.

    Returns:
        dict: ``{column_position: {last_element: element_at_position}}``.

    Note:
        ``itertools.permutations`` yields ``n!`` rows for ``n`` items, so
        this is only practical for small inputs.
    """
    perm_ = pd.DataFrame(itertools.permutations(u_data))
    last_col = perm_.shape[1] - 1
    return perm_.set_index(last_col).to_dict()
def split_by_num_new(data, k):
    """Apply ``func`` to every user group that has more than one index.

    Args:
        data: Object exposing ``data.user.to_numpy()`` (presumably a pandas
            DataFrame with a ``user`` column -- confirm against the caller).
        k: Not used by this implementation; kept so existing calls such as
            ``split_by_num_new(data, k=1)`` keep working.

    Returns:
        list: The result of ``func`` for each retained group, in group order.
    """
    user_indices = data.user.to_numpy()
    # NOTE(review): _groupby_user is defined elsewhere; it is assumed to
    # return an iterable of per-user index arrays -- confirm.
    user_split_indices = [
        group for group in _groupby_user(user_indices, True) if len(group) > 1
    ]
    return [func(u_data) for u_data in user_split_indices]
Calling function using dict, map and lambda:
f_start = timeit.default_timer()
temp_indices_ = split_by_num_new(data, k=1)
function_time = timeit.default_timer()
print('funct Time Completed at : ', function_time - f_start)
# Entry 0 of each per-user dict becomes a two-column (key, value) frame.
# Build all frames first and concatenate once instead of growing the result
# with pd.concat on every iteration.
frames = [pd.DataFrame(perm_dict[0].items()) for perm_dict in temp_indices_]
# pd.concat raises on an empty list, so fall back to an empty frame.
temp_indices = pd.concat(frames, axis=0) if frames else pd.DataFrame()
temp_indices = temp_indices.rename(columns={0: 'ind', 1: 'label_ind'})
Total time on the entire dataset of (1311612, 60) takes : 2083.1114619 secs
Older function that uses pandas:
def split_by_num_pandas(data, k):
    """Build a long-format frame of permutation entries for each user group.

    For every group returned by ``_groupby_user`` with more than one index,
    all permutations are laid out as a DataFrame (one per row). Only the
    first permutation for each distinct last element is kept, the last
    column becomes the index, and the remaining columns are stacked into
    rows. The per-group frames are then concatenated.

    Args:
        data: Object exposing ``data.user.to_numpy()`` (presumably a pandas
            DataFrame with a ``user`` column -- confirm against the caller).
        k: ``k - 1`` is the column label renamed to ``'label_ind'``.

    Returns:
        tuple: ``(temp_indices, user_split_indices)`` -- the concatenated
        DataFrame (empty when no group qualifies) and the list of retained
        groups.

    Note:
        ``itertools.permutations`` yields ``n!`` rows for a group of size
        ``n``, so the cost grows factorially with the group size.
    """
    user_indices = data.user.to_numpy()
    # NOTE(review): _groupby_user is defined elsewhere; it is assumed to
    # return an iterable of per-user index arrays -- confirm.
    user_split_indices = [
        group for group in _groupby_user(user_indices, True) if len(group) > 1
    ]
    frames = []
    loop_start = timeit.default_timer()
    for u_data in user_split_indices:
        last_col = len(u_data) - 1
        perm_ = (
            pd.DataFrame(itertools.permutations(u_data))
            .drop_duplicates(subset=last_col, keep="first")
            .set_index(last_col)
            .stack()
            .reset_index()
            .rename(columns={'level_1': 'user_', last_col: 'ind', k - 1: 'label_ind'})
        )
        frames.append(perm_)
    # Concatenate once after the loop: calling pd.concat per iteration
    # re-copies the accumulated frame every time.
    concat_start = timeit.default_timer()
    temp_indices = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    concat_stop = timeit.default_timer()
    print('concat Time Completed at : ', concat_stop - concat_start)
    loop_stop = timeit.default_timer()
    print('Loop Time Completed at : ', loop_stop - loop_start)
    return temp_indices, user_split_indices
Call function that uses pandas:
# NOTE(review): split_by_num_pandas returns a (temp_indices, user_split_indices)
# tuple, so temp_indices_pd is bound to the whole tuple here, not only to the
# DataFrame -- confirm this is intended.
temp_indices_pd = split_by_num_pandas(data,k=1)
concat Time Completed at : 0.00038189999941096175
concat Time Completed at : 0.0004867000006925082
Loop Time Completed at : 0.011297000000922708
Post a Comment for "Improving The Speed When Calculating Permutation On Multiple Elements In List Of Arrays"