python pandas iloc bug or a feature?


This is my function to merge 2 dataframes by matching field values.

#allow you to merge (left join) 2 dataframes
def levenstein_merge(dfa, dfb, left_on, right_on, limit = 0x7FFFFFFF): #0x7FFFFFFF is a max integer
    import pandas as pd        
    import Levenshtein as l
    a = dfa[left_on]
    b = dfb[right_on]

    mindst = 0x7FFFFFFF
    klist = [None] * len(a)
    lena = len(dfa.columns) 
    lenb = len(dfb.columns) 
    bcols = list(range(lena, lena+lenb)) #bcols is a list with dfb columns indexes after concatenation

    res = dfa.copy()
    to_concat = pd.DataFrame([], columns=dfb.columns)
    res = pd.concat([res, to_concat], axis=1) #this will add columns to res from dfb
    res.columns = list(res.columns)

    for i, ival in enumerate(a):
        mindst = 0x7FFFFFFF #reset mindst on each cycle
        for k, kval in enumerate(b):
            dst = l.distance(a[i], b[k])
            if dst <= limit and dst < mindst: # if current distance less than saved
                mindst = dst
                klist[i] = k

        if klist[i] != None: #if index found than
            res.iloc[i,bcols] = dfb.iloc[klist[i],:] #set dfb columns values in res row to klist[i] row in dfb

    return res

Bug presentation

import pandas as pd
df = pd.DataFrame([['africa',1],['b',2],['down',3]], columns=['key','val'])
df2 = pd.DataFrame([['antica',11],['b',22],['cn',33]], columns=['key','val'])

res = levenstein_merge(df, df2,'key','key')

      key val     key val
0  antica  11  antica  11
1       b   2       b  22
2    down   3      cn  33

      key   val     key  val
0  africa     1  antica   11
1       b     2       b   22
2    down     3      cn   33

If i use different column names in df and df2 - everything is ok. I found that problem is here:

res.iloc[i,bcols] = dfb.iloc[klist[i],:]

res.iloc[i,bcols] for some reason is not working properly.


asked on Stack Overflow Mar 12, 2020 by legale

0 Answers

Nobody has answered this question yet.

User contributions licensed under CC BY-SA 3.0