Из комментариев вы можете увидеть, как я это построил
- используйте
itertools.combinations
, чтобы получить пары NETWORK_ID внутри S C - затем несколько
explode()
, чтобы преобразовать этот список пар в строки - разбить пары на столбцы, чтобы их можно было использовать в
merge()
- все подготовительные работы выполнены - используйте
geopy.distance
для расчета расстояния
Из ваших данных примера я не смог увидеть сопоставление с столбцами, которые вы указали, что хотите. Все просто, теперь у вас есть все допустимые комбинации и расстояния между ними.
import itertools
import geopy.distance
data = """SC NETWORK_ID Lat Lon
0 ED0002 41.6742 26.553130600000003
0 ED0016 41.7750556 26.9170278
0 IS0030 41.05455556 28.80925
0 IS0134 40.97616944 28.80165
1 ED0016 41.7750556 26.9170278
1 IS0096 40.99406111 28.82413056
1 IS0137 40.98033889 28.73052778
1 IS0280 41.1085556 28.874861100000004
2 ED0002 41.6742 26.553130600000003
2 ED0018 41.26675 26.687669399999997
2 IS0034 41.10741944 28.80134167
2 IS0063 40.97868889 28.72448889
2 IS0280 41.1085556 28.874861100000004
2 IS0880 41.03519444 28.82001389
3 ED0018 41.26675 26.687669399999997
3 IB9048 40.98021667 28.833175
3 IS0034 41.10741944 28.80134167
3 IS0063 40.97868889 28.72448889
3 IS0280 41.1085556 28.874861100000004 """
a = [[i for i in l.split(" ") if i!=""] for l in data.split("\n")]
df = pd.DataFrame(a[1:], columns=a[0])
# first get pairs of neigbors within SC
dfd = ( df.groupby("SC").agg({"NETWORK_ID":lambda s: list(itertools.combinations(list(s), 2))})
# generate rows for each pair
.explode("NETWORK_ID").rename({"NETWORK_ID":"PAIR"}, axis=1).reset_index()
# split pairs of locations back into individual column s
.assign(n1=lambda dfa: dfa["PAIR"].apply(lambda s: s[0]),
n2=lambda dfa: dfa["PAIR"].apply(lambda s: s[1]))
# merge back original data so have lat/lon for the pair of locations
.merge(df, left_on=["SC","n1"], right_on=["SC","NETWORK_ID"]).drop("NETWORK_ID", axis=1)
.merge(df, left_on=["SC","n2"], right_on=["SC","NETWORK_ID"]).drop("NETWORK_ID", axis=1)
# finally calc the distance
.assign(miles=lambda dfa: dfa.apply(lambda r:
geopy.distance.geodesic((r["Lat_x"],r["Lon_x"]),
(r["Lat_y"],r["Lon_y"])).miles, axis=1)) )
print(dfd.to_string(index=False))
вывод
SC PAIR n1 n2 Lat_x Lon_x Lat_y Lon_y miles
0 (ED0002, ED0016) ED0002 ED0016 41.6742 26.553130600000003 41.7750556 26.9170278 20.060612
0 (ED0002, IS0030) ED0002 IS0030 41.6742 26.553130600000003 41.05455556 28.80925 124.842785
0 (ED0016, IS0030) ED0016 IS0030 41.7750556 26.9170278 41.05455556 28.80925 110.157080
0 (ED0002, IS0134) ED0002 IS0134 41.6742 26.553130600000003 40.97616944 28.80165 126.496404
0 (ED0016, IS0134) ED0016 IS0134 41.7750556 26.9170278 40.97616944 28.80165 112.408698
0 (IS0030, IS0134) IS0030 IS0134 41.05455556 28.80925 40.97616944 28.80165 5.423670
1 (ED0016, IS0096) ED0016 IS0096 41.7750556 26.9170278 40.99406111 28.82413056 112.821672
1 (ED0016, IS0137) ED0016 IS0137 41.7750556 26.9170278 40.98033889 28.73052778 109.054636
1 (IS0096, IS0137) IS0096 IS0137 40.99406111 28.82413056 40.98033889 28.73052778 4.985179
1 (ED0016, IS0280) ED0016 IS0280 41.7750556 26.9170278 41.1085556 28.874861100000004 111.584270
1 (IS0096, IS0280) IS0096 IS0280 40.99406111 28.82413056 41.1085556 28.874861100000004 8.333445
1 (IS0137, IS0280) IS0137 IS0280 40.98033889 28.73052778 41.1085556 28.874861100000004 11.625086
2 (ED0002, ED0018) ED0002 ED0018 41.6742 26.553130600000003 41.26675 26.687669399999997 28.972852
2 (ED0002, IS0034) ED0002 IS0034 41.6742 26.553130600000003 41.10741944 28.80134167 123.206273
2 (ED0018, IS0034) ED0018 IS0034 41.26675 26.687669399999997 41.10741944 28.80134167 110.732410
2 (ED0002, IS0063) ED0002 IS0063 41.6742 26.553130600000003 40.97868889 28.72448889 122.724622
2 (ED0018, IS0063) ED0018 IS0063 41.26675 26.687669399999997 40.97868889 28.72448889 108.125677
2 (IS0034, IS0063) IS0034 IS0063 41.10741944 28.80134167 40.97868889 28.72448889 9.748491
2 (ED0002, IS0280) ED0002 IS0280 41.6742 26.553130600000003 41.1085556 28.874861100000004 126.809654
2 (ED0018, IS0280) ED0018 IS0280 41.26675 26.687669399999997 41.1085556 28.874861100000004 114.537975
2 (IS0034, IS0280) IS0034 IS0280 41.10741944 28.80134167 41.1085556 28.874861100000004 3.838056
2 (IS0063, IS0280) IS0063 IS0280 40.97868889 28.72448889 41.1085556 28.874861100000004 11.917612
2 (ED0002, IS0880) ED0002 IS0880 41.6742 26.553130600000003 41.03519444 28.82001389 125.846863
2 (ED0018, IS0880) ED0018 IS0880 41.26675 26.687669399999997 41.03519444 28.82001389 112.361444
2 (IS0034, IS0880) IS0034 IS0880 41.10741944 28.80134167 41.03519444 28.82001389 5.078493
2 (IS0063, IS0880) IS0063 IS0880 40.97868889 28.72448889 41.03519444 28.82001389 6.335481
2 (IS0280, IS0880) IS0280 IS0880 41.1085556 28.874861100000004 41.03519444 28.82001389 5.816514
3 (ED0018, IB9048) ED0018 IB9048 41.26675 26.687669399999997 40.98021667 28.833175 113.685179
3 (ED0018, IS0034) ED0018 IS0034 41.26675 26.687669399999997 41.10741944 28.80134167 110.732410
3 (IB9048, IS0034) IB9048 IS0034 40.98021667 28.833175 41.10741944 28.80134167 8.933948
3 (ED0018, IS0063) ED0018 IS0063 41.26675 26.687669399999997 40.97868889 28.72448889 108.125677
3 (IB9048, IS0063) IB9048 IS0063 40.98021667 28.833175 40.97868889 28.72448889 5.684762
3 (IS0034, IS0063) IS0034 IS0063 41.10741944 28.80134167 40.97868889 28.72448889 9.748491
3 (ED0018, IS0280) ED0018 IS0280 41.26675 26.687669399999997 41.1085556 28.874861100000004 114.537975
3 (IB9048, IS0280) IB9048 IS0280 40.98021667 28.833175 41.1085556 28.874861100000004 9.120037
3 (IS0034, IS0280) IS0034 IS0280 41.10741944 28.80134167 41.1085556 28.874861100000004 3.838056
3 (IS0063, IS0280) IS0063 IS0280 40.97868889 28.72448889 41.1085556 28.874861100000004 11.917612
дополнительное обновление
Необходимо иметь дело с крайним случаем, когда S C имеет только один NETWORK_CELLID . Создайте синтезатор c tuple
самому себе.
df = pd.read_csv("3G_Cell_File_.cel", sep="\t")
mycols = ['NETWORK_CELLID', 'SC', 'Lat', 'Lon']
# de-duplicate no point in calculating distance many times...
dfdd = df.loc[:,mycols].drop_duplicates().reset_index(drop=True)
# there are cases where an SC has only one NETWORK_CELLID so there is no combination
# make tuple to itself in this situation
dfd = ( dfdd.groupby("SC").agg({"NETWORK_CELLID":
lambda s: list(itertools.combinations(list(s), 2))
if len(list(s))>1 else [tuple(list(s)*2)]})
# generate rows for each pair
.explode("NETWORK_CELLID").rename({"NETWORK_CELLID":"PAIR"}, axis=1).reset_index()
# split pairs of locations back into individual column s
.assign(n1=lambda dfa: dfa["PAIR"].apply(lambda s: s[0]),
n2=lambda dfa: dfa["PAIR"].apply(lambda s: s[1]))
# merge back original data so have lat/lon for the pair of locations
.merge(dfdd, left_on=["SC","n1"], right_on=["SC","NETWORK_CELLID"]).drop("NETWORK_CELLID", axis=1)
.merge(dfdd, left_on=["SC","n2"], right_on=["SC","NETWORK_CELLID"]).drop("NETWORK_CELLID", axis=1)
# finally calc the distance
.assign(miles=lambda dfa: dfa.apply(lambda r:
geopy.distance.geodesic((r["Lat_x"],r["Lon_x"]),
(r["Lat_y"],r["Lon_y"])).miles, axis=1))
)