Спасибо за интересную задачу! Проблема решена: решение приведено ниже. Следуйте комментариям в коде и не стесняйтесь задавать вопросы.
import pandas as pd
from collections import Counter
# Sample input: one row per reported event, described by its severity,
# partition, status text and a free-form comment.
df_detailed = pd.DataFrame(
    data=[
        ["Fail", "P1", "3 Failed Partition", "X001, X002, X003"],
        ["Fail", "P1", "Late Backup", "Late Backup"],
        ["Fail", "P1", "2 Failed Partition", "X001, X002"],
        ["Fail", "P2", "2 Failed Partition", "X001, X002"],
        ["Fail", "P2", "Late Backup", "Late Backup"],
        ["Warn", "P2", "Huge Size", "1GB"],
        ["Warn", "P2", "Huge Size", "2GB"],
    ],
    columns=["Severity", "Partition", "Status", "Comment"],
)
def change_warn(severity, status):
    """Return the status to report for a row, masking warning rows.

    Rows whose severity is "Warn" are reported with the generic status
    "Warn" instead of their real status; every other row keeps its status.

    Args:
        severity: Severity label of the row (e.g. "Fail" or "Warn").
        status: Original status text of the row.

    Returns:
        "Warn" when ``severity == "Warn"``, otherwise ``status`` unchanged.
    """
    return "Warn" if severity == "Warn" else status
# Replace the status of every "Warn" row with the generic "Warn" label;
# all other rows keep their original status.
df_detailed["Status"] = df_detailed.apply(
    lambda record: change_warn(record["Severity"], record["Status"]),
    axis=1,
)
def remove_leading_digits(x):
    """Drop the leading count token from a status string.

    When ``x`` starts with a digit, everything up to and including the
    first space is removed ("3 Failed Partition" -> "Failed Partition").
    Strings that do not start with a digit are returned unchanged.

    Args:
        x: Status text, possibly prefixed with a number and a space.

    Returns:
        The status text without its leading numeric token. A string that
        starts with a digit but contains no space becomes "".
    """
    # Slice instead of indexing so an empty string does not raise IndexError.
    if x[:1].isdigit():
        # Everything after the first space, i.e. the text without token one.
        return x.partition(" ")[2]
    return x
# Strip the leading count so equal statuses compare equal
# (e.g. "3 Failed Partition" and "2 Failed Partition").
df_detailed["Status"] = df_detailed["Status"].apply(remove_leading_digits)

# Terminate every comment with "," so that comments concatenated by the
# group-wise sum below can be split apart again.
df_detailed["Comment"] = df_detailed["Comment"].apply(lambda comment: comment + ",")

# Prefix the status with its partition to keep P1 and P2 groups apart.
df_detailed["TempStatus"] = df_detailed["Partition"] + " " + df_detailed["Status"]

gr_b = (
    df_detailed[["Partition", "TempStatus", "Comment"]]
    .groupby("TempStatus")
    .sum()
)
def calculate_unique_comment(status, comment):
    """Count the distinct comma-separated comments of a failed-partition group.

    Args:
        status: Group label; only labels ending with "Failed Partition"
            are counted.
        comment: Comma-separated comment text. Pieces are stripped of
            surrounding whitespace; empty pieces are ignored.

    Returns:
        The number of distinct comments as a string, or "0" when ``status``
        does not end with "Failed Partition".
    """
    if not status.endswith("Failed Partition"):
        return "0"
    # Strip before testing for emptiness so that whitespace-only pieces
    # (e.g. from ", ,") are not counted as a comment of their own.
    stripped_pieces = (piece.strip() for piece in comment.split(","))
    unique_comments = {piece for piece in stripped_pieces if piece}
    return str(len(unique_comments))
# The summed "Partition" column is not needed for the steps below.
del gr_b["Partition"]
# Turn the "TempStatus" group key back into a regular column so each row
# can be read as row["TempStatus"] below.
gr_b = gr_b.reset_index()
gr_b["CountUnCom"] = gr_b.apply(
    lambda row: calculate_unique_comment(row["TempStatus"], row["Comment"]),
    axis=1,
)

# Map every "<partition> Failed Partition" label to its number of unique
# comments (kept as the string produced by calculate_unique_comment).
part_dict = {
    temp_status: unique_count
    for temp_status, unique_count in zip(gr_b["TempStatus"], gr_b["CountUnCom"])
    if temp_status.endswith("Failed Partition")
}

# Work on an explicit copy holding only the columns needed from here on,
# so the edits below never touch df_detailed.
df_small = df_detailed[["Partition", "Status"]].copy()
# Terminate each status with "," so the per-partition sum can be split later.
df_small["Status"] = df_small["Status"].apply(lambda x: x + ",")
gr_df_small = df_small.groupby("Partition").sum()
gr_df_small = gr_df_small.reset_index()
def convert_status_to_list(status):
    """Split a comma-separated status string into a list of statuses.

    Args:
        status: Status names joined (and possibly terminated) by ",".

    Returns:
        The status names in their original order, stripped of surrounding
        whitespace, with empty pieces left out.
    """
    # Strip before testing for emptiness so whitespace-only pieces do not
    # end up in the result as "".
    stripped_pieces = (piece.strip() for piece in status.split(","))
    return [piece for piece in stripped_pieces if piece]
# Split each partition's concatenated status string back into a list of
# individual status names.
gr_df_small["Status"] = gr_df_small["Status"].apply(convert_status_to_list)
def calculate_status(partition, status, x):
    """Build the one-line status summary of a partition.

    Every distinct status is rendered as "<count> <status>" and the pieces
    are joined with single spaces, in order of first appearance.

    Args:
        partition: Partition name, used to build the lookup key in ``x``.
        status: List of status names recorded for the partition.
        x: Mapping from "<partition> Failed Partition" to the value shown
            in front of "Failed Partition" instead of the occurrence count.

    Returns:
        The summary string, or "" when ``status`` is empty.

    Raises:
        KeyError: If ``status`` contains "Failed Partition" but ``x`` has
            no entry for this partition.
    """
    summary_parts = []
    for name, count in Counter(status).items():
        if name == "Failed Partition":
            # Show the value supplied by the caller rather than the number
            # of rows that carried this status.
            count = x[partition + " " + "Failed Partition"]
        summary_parts.append(f"{count} {name}")
    return " ".join(summary_parts)
# Collapse each partition's list of statuses into a single summary string,
# then show the resulting table.
gr_df_small["Status"] = gr_df_small.apply(
    lambda record: calculate_status(record["Partition"], record["Status"], part_dict),
    axis=1,
)
print(gr_df_small)
Вывод:
Partition Status
0 P1 3 Failed Partition 1 Late Backup
1 P2 2 Failed Partition 1 Late Backup 2 Warn