Base R, но также должно работать с data.table. Обратите внимание: это не сработает, если после слияния с вашими данными на более детальном уровне (т.е. для лет < 1945) первое значение окажется NA; если это так, дайте мне знать, и я переработаю своё решение:
# Upsample each group to yearly resolution and linearly interpolate the gaps.
# The data is split by `name`, every group is expanded to one row per year
# between its own first and last observed year, and the per-group results are
# stacked back into a single data.frame with fresh row names.
data.frame(
  do.call(
    "rbind",
    # drop = TRUE skips unused factor levels, which would yield empty groups.
    lapply(split(df, df$name, drop = TRUE), function(x) {
      # Range of years observed in THIS group (not in the whole data set):
      # year_range => numeric vector of length 2
      year_range <- range(as.numeric(x$year))
      # Lookup table holding every year in the group's range:
      # year_lkp => data.frame
      year_lkp <- data.frame(year = seq(year_range[1], year_range[2]))
      # Left-join the group onto the lookup so absent years become NA rows:
      # df2 => data.frame
      df2 <- merge(year_lkp, x, by = "year", all.x = TRUE)
      # Linearly interpolate the missing values against the year axis:
      # value => numeric vector
      is_gap <- is.na(df2$value)
      # approx() needs at least two known points to interpolate between.
      if (any(is_gap) && sum(!is_gap) >= 2L) {
        df2$value[is_gap] <- approx(
          x = df2$year[!is_gap],
          y = df2$value[!is_gap],
          xout = df2$year[is_gap]
        )$y
      }
      # Every row of a split group shares one name, so fill it in directly:
      # name => same type as x$name
      df2$name <- x$name[1L]
      # Value of the anonymous function, handed back to lapply():
      # df2 => data.frame
      df2
    })
  ),
  stringsAsFactors = FALSE, row.names = NULL
)
Данные:
# Load data.table, which provides fread():
library(data.table)
# Parse the inline, whitespace-separated table below (header row: year, name,
# value). It holds two groups, USA (1945-1970) and WSM (1985-2010), each
# observed every five years: dt => data.table (fread's default return type)
dt <- fread(
"year name value
1945 USA 110265118
1950 USA 122994019
1955 USA 134001770
1960 USA 150234347
1965 USA 167515758
1970 USA 172867051
1985 WSM 152325
1990 WSM 159500
1995 WSM 161677
2000 WSM 174600
2005 WSM 177510
2010 WSM 180140")
# Convert to a plain data.frame, the object the base R solution operates on:
# df => data.frame
df <- data.frame(dt)
Python с использованием Pandas:
# Initialise pandas in session:
import pandas as pd
# Observation years, two runs of six five-year steps: year => list of integers
year = [*range(1945, 1975, 5), *range(1985, 2015, 5)]
# Group label for every observation: name => list of strings
name = ["USA"] * 6 + ["WSM"] * 6
# Observed values, in the same order as year and name: value => list of integers
value = [
    110265118, 122994019, 134001770, 150234347, 167515758, 172867051,
    152325, 159500, 161677, 174600, 177510, 180140,
]
# Assemble the three parallel lists column-wise: df => Data Frame
df = pd.DataFrame(dict(year=year, name=name, value=value))
# Function upsampling and interpolating data: upsample_df => function
def upsample_df(g_df):
    """Expand one group to yearly rows and linearly interpolate the gaps.

    Parameters
    ----------
    g_df : pandas.DataFrame
        Rows of a single group. Must contain a ``year`` column. Numeric
        columns (such as ``value``) are interpolated; a ``name`` column,
        when present, is forward-filled.

    Returns
    -------
    pandas.DataFrame
        One row per year from the group's minimum to its maximum year
        (inclusive). An empty input is returned as an empty copy.
    """
    # Nothing to expand; min()/max() of an empty column would be NaN.
    if g_df.empty:
        return g_df.copy()
    first_year = int(g_df['year'].min())
    last_year = int(g_df['year'].max())
    years = pd.DataFrame({'year': list(range(first_year, last_year + 1))})
    # Left join so that years absent from the group show up as NaN rows.
    m_df = pd.merge(years, g_df, how='left', on='year')
    # Interpolate the numeric columns only: calling DataFrame.interpolate on a
    # frame that also holds the string column `name` is deprecated in recent
    # pandas releases.
    numeric_cols = m_df.select_dtypes(include='number').columns
    m_df[numeric_cols] = m_df[numeric_cols].interpolate(method='linear')
    # The label is constant within a group, so carry it into the new rows.
    if 'name' in m_df.columns:
        m_df['name'] = m_df['name'].ffill()
    return m_df
# Apply function groupwise (according to name): upsampled_df => Data Frame
# Iterating over the groups hands each complete sub-frame (grouping column
# included) to upsample_df, which forward-fills the `name` column; relying on
# groupby(...).apply(...) to pass the grouping column is deprecated in recent
# pandas releases.
upsampled_df = pd.concat(
    [upsample_df(group) for _, group in df.groupby('name')]
)
# Renumber the rows 1..n: upsampled_df.index => integer
upsampled_df.index = range(1, len(upsampled_df) + 1)
# Output result to console: upsampled_df => stdout (console)
print(upsampled_df.head())