This is my function to merge 2 dataframes by matching field values.
#allow you to merge (left join) 2 dataframes
def levenstein_merge(dfa, dfb, left_on, right_on, limit = 0x7FFFFFFF): #0x7FFFFFFF is a max integer
import pandas as pd
import Levenshtein as l
a = dfa[left_on]
b = dfb[right_on]
mindst = 0x7FFFFFFF
klist = [None] * len(a)
lena = len(dfa.columns)
lenb = len(dfb.columns)
bcols = list(range(lena, lena+lenb)) #bcols is a list with dfb columns indexes after concatenation
res = dfa.copy()
to_concat = pd.DataFrame([], columns=dfb.columns)
res = pd.concat([res, to_concat], axis=1) #this will add columns to res from dfb
res.columns = list(res.columns)
for i, ival in enumerate(a):
mindst = 0x7FFFFFFF #reset mindst on each cycle
for k, kval in enumerate(b):
dst = l.distance(a[i], b[k])
if dst <= limit and dst < mindst: # if current distance less than saved
mindst = dst
klist[i] = k
if klist[i] != None: #if index found than
res.iloc[i,bcols] = dfb.iloc[klist[i],:] #set dfb columns values in res row to klist[i] row in dfb
return res
import pandas as pd
df = pd.DataFrame([['africa',1],['b',2],['down',3]], columns=['key','val'])
df2 = pd.DataFrame([['antica',11],['b',22],['cn',33]], columns=['key','val'])
res = levenstein_merge(df, df2,'key','key')
print(res)
#results
key val key val
0 antica 11 antica 11
1 b 2 b 22
2 down 3 cn 33
#expected
key val key val
0 africa 1 antica 11
1 b 2 b 22
2 down 3 cn 33
If i use different column names in df and df2 - everything is ok. I found that problem is here:
res.iloc[i,bcols] = dfb.iloc[klist[i],:]
res.iloc[i,bcols]
for some reason is not working properly.
IS THIS A BUG OR A FEATURE?
User contributions licensed under CC BY-SA 3.0