标签:types cli port cos DPoS strong first 读取 文件中

def read_file(file_path, use_col=None, converters=None):
    """Load a tabular file into a DataFrame, choosing the reader by extension.

    :param file_path: path of the file to read; it must end in ``.xlsx``,
        ``.csv`` or ``.pkl`` (compared case-insensitively).
    :param use_col: columns to keep, forwarded as ``usecols`` to the Excel/CSV
        readers (not used for pickle files).
    :param converters: per-column converters forwarded to the Excel/CSV
        readers (not used for pickle files).
    :return: the loaded ``pandas.DataFrame``.
    :raises ValueError: if the extension is not one of the supported ones.
    """
    # Match on the *end* of the path only, so that a directory or file stem
    # that merely contains ".csv"/".xlsx" does not select the wrong reader.
    lowered = str(file_path).lower()

    if lowered.endswith('.xlsx'):
        return pd.read_excel(file_path, usecols=use_col, converters=converters)

    if lowered.endswith('.csv'):
        try:
            return pd.read_csv(file_path, usecols=use_col, converters=converters)
        except OSError:
            # Retry with the pure-Python parser.
            # NOTE(review): presumably a workaround for the C parser failing
            # to open some paths (e.g. non-ASCII ones) - confirm it is still
            # needed with the pandas version in use.
            return pd.read_csv(file_path,
                               engine='python',
                               usecols=use_col,
                               converters=converters)
        except UnicodeDecodeError:
            # Not UTF-8: retry as GBK. The handle is opened in a ``with``
            # block so it is closed even if parsing fails.
            with open(file_path, encoding='gbk') as handle:
                return pd.read_csv(handle, usecols=use_col, converters=converters)

    if lowered.endswith('.pkl'):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        return pd.read_pickle(file_path)

    raise ValueError('文件类型错误,仅支持csv、xlsx、'
                     'pickle格式')
# Thin wrapper around pandas' pivot_table
def pivot_table(data, index, values, **kwargs):
    """Pivot ``data`` after casting value columns to float and index columns to str.

    The casts are applied to a copy, so the caller's DataFrame is left
    untouched.

    :param data: source DataFrame.
    :param index: column label, or list of labels, used as the pivot index;
        these columns are cast to ``str``.
    :param values: column label, or list of labels, to aggregate; these
        columns are cast to ``float``.
    :param kwargs: forwarded unchanged to ``pandas.pivot_table``.
    :return: the pivoted DataFrame.
    """
    value_cols = list(values) if isinstance(values, list) else [values]
    index_cols = list(index) if isinstance(index, list) else [index]

    # Index casts are applied last, so a column listed in both ends up as str.
    dtypes = {col: 'float' for col in value_cols}
    dtypes.update({col: 'str' for col in index_cols})

    prepared = data.astype(dtypes)
    return pd.pivot_table(prepared,
                          index=index,
                          values=values,
                          **kwargs)
在pivot之后,常出现多个二维columns的情况
# Example of the two-level column labels shown for a pivoted frame:
# each entry is a (level-0, level-1) pair.
cols = [
    ('ftime', ''),
    ('uid', ''),
    ('agent_id', ''),
    ('industry_name_level1', ''),
    ('product_type', ''),
    ('adpos', ''),
    ('trace_cnt', '0'),
    ('trace_cnt', 'PRODUCTTYPE_APPLE_APP_STORE'),
    ('trace_cnt', 'PRODUCTTYPE_JD_URL'),
    ('trace_cnt', 'PRODUCTTYPE_LEAD_AD'),
    ('trace_cnt', 'PRODUCTTYPE_OPEN_PLATFORM_APP_MOB'),
    ('trace_cnt', 'PRODUCTTYPE_WECHAT'),
]
如何处理呢?
def col_rename(list_MultiIndex):
    """Flatten two-level column labels into single strings.

    Each label is turned into ``str(level1) + str(level0)``, i.e. the second
    level comes first in the resulting name.

    :param list_MultiIndex: iterable of labels that can be indexed with
        ``[0]`` and ``[1]``.
    :return: list of flattened column names, in input order.
    """
    return [str(colname[1]) + str(colname[0]) for colname in list_MultiIndex]
def combine_pivot_col_name(single_col):
    """Collapse one two-level column label returned by a pivot into one name.

    :param single_col: a label that can be indexed with ``[0]`` and ``[1]``.
    :return: the first level when the second level is ``''`` or ``'0'``,
        otherwise the second level.
    """
    top_level, sub_level = single_col[0], single_col[1]
    # '' and '0' are treated as "no meaningful second level".
    if sub_level in ('', '0'):
        return top_level
    return sub_level
>> ['ftime', 'uid', 'agent_id', 'industry_name_level1', 'product_type', 'adpos', 'trace_cnt', 'PRODUCTTYPE_APPLE_APP_STORE', 'PRODUCTTYPE_JD_URL', 'PRODUCTTYPE_LEAD_AD', 'PRODUCTTYPE_OPEN_PLATFORM_APP_MOB', 'PRODUCTTYPE_WECHAT']
# One-liner alternative: the comparison yields False/True, which indexes the
# pair as 0/1, so the second level is chosen only when it is longer than the
# first level.
b = [pair[len(pair[0]) < len(pair[1])] for pair in cols]
# >> ['ftime', 'uid', 'agent_id', 'industry_name_level1', 'product_type', 'adpos', 'trace_cnt', 'PRODUCTTYPE_APPLE_APP_STORE', 'PRODUCTTYPE_JD_URL', 'PRODUCTTYPE_LEAD_AD', 'PRODUCTTYPE_OPEN_PLATFORM_APP_MOB', 'PRODUCTTYPE_WECHAT']
创建一个对象,是一个完整的个体,有属性和方法等接口
# Directories are spelled once instead of being repeated in every read call.
_REPORT_DIR = r'E:/行业效果数据和OCPA数据'
_MAPPING_DIR = _REPORT_DIR + '/映射表'

# Columns loaded from the daily revenue CSV.
revenueday_select_col = [
    'ftime', 'fuid', 'adv_name', 'agent_id', 'agent_name', 'appid_id', 'KPI_first_ind', 'new_first_industry',
    'new_second_industry', 'show', 'click', 'real_cost', 'is_smb', 'adv_sale_tag_name', 'sign',
    'f_year', 'f_month', 'f_M', 'f_Q', 'f_sweek', 'f_yweek', 'f_YW', 'first_year', 'first_month', 'first_sweek',
    'product', 'flow_name_level2', 'product2', 'R_Group', 'R_emp', 'big_area', 'province_manual', 'city',
    'Q_new_old', 'is_tc', 'lingyu', 'is_industry_one', 'is_industry_two',
    'first_cost_date', 'bidtype', 'CZ_amount', 'year_new_old',
]

revenueday = pd.read_csv(_REPORT_DIR + '/数据源/revenueday/revenue_new_day.csv',
                         encoding='gbk',
                         usecols=revenueday_select_col)
# Rename source columns to the names used by the rest of the script.
revenueday.rename(columns={
    'appid_id': 'wechatappid',
    'new_first_industry': 'three_ind1',
    'new_second_industry': 'three_ind2',
    'is_smb': 'is_smb_original',
    'show': 'exposure_cnt',
    'click': 'click_cnt',
    'product': 'type',
    'province_manual': 'province_area_manual',
}, inplace=True)

# Mapping tables.
product_type_yinshe = pd.read_excel(_MAPPING_DIR + '/产品类型映射表.xlsx')
track_yinshe = pd.read_excel(_MAPPING_DIR + '/效果映射表.xlsx')
flow_yinshe = pd.read_excel(_MAPPING_DIR + '/站点映射表.xlsx')
gzh_yinshe = pd.read_csv(_MAPPING_DIR + '/公众号名称.csv', encoding='gbk', usecols=['fuid', '昵称'])
desttype_yinshe = pd.read_excel(_MAPPING_DIR + '/落地页映射表.xlsx')
creative_size_yinshe = pd.read_excel(_MAPPING_DIR + '/素材尺寸映射表.xlsx')
## Normalise the mapping tables: each key column is cast to str.
# The .info() / .unique() calls are exploratory checks; .unique() results are
# discarded when this runs as a script.
# Columns are replaced with df[col] = ... rather than df.loc[:, col] = ...,
# because the assignment changes the column's dtype.
product_type_yinshe.info()
product_type_yinshe['product_type'] = product_type_yinshe['product_type'].apply(str)
product_type_yinshe.product_type.unique()

track_yinshe.info()
track_yinshe['acttion_track_type'] = track_yinshe['acttion_track_type'].apply(str)
track_yinshe.acttion_track_type.unique()

gzh_yinshe.info()
# Go through int64 first so the ids are rendered without a decimal part.
gzh_yinshe['fuid'] = gzh_yinshe['fuid'].astype('int64').astype(str)
gzh_yinshe.fuid.unique()
# Keep only the last row for each fuid.
gzh_yinshe.drop_duplicates(subset=['fuid'], keep='last', inplace=True)

desttype_yinshe.info()
desttype_yinshe['desttype'] = desttype_yinshe['desttype'].apply(str)
desttype_yinshe.desttype.unique()

creative_size_yinshe.info()
creative_size_yinshe['creative_size'] = creative_size_yinshe['creative_size'].apply(str)
creative_size_yinshe.creative_size.unique()

flow_yinshe.info()
### Process revenueday
revenueday.info()
# Treat NaN, inf and the textual placeholders as 0.
revenueday.replace([np.nan, np.inf, 'NA', '-', 'nan'], [0, 0, 0, 0, 0], inplace=True)
if revenueday['fuid'].dtype != 'object':
    revenueday['fuid'] = revenueday['fuid'].astype(str)
# Go through int64 first so the values are rendered without a decimal part.
revenueday['f_year'] = revenueday['f_year'].astype('int64').astype(str)
revenueday['f_yweek'] = revenueday['f_yweek'].astype('int64').astype(str)
revenueday['f_month'] = revenueday['f_month'].astype('int64').astype(str)
revenueday['f_yweek'].unique()

# Row filters. A date filter that is None or empty means "do not filter".
is_ind = revenueday['KPI_first_ind'].isin(industry)
is_adv_sale_tag_name = revenueday['adv_sale_tag_name'].isin(['地方站'])
is_year_sel = revenueday['f_year'].isin(Y) if Y else None
is_quarter_sel = revenueday['f_Q'].isin(quarter) if quarter else None
is_month_sel = revenueday['f_month'].isin(month) if month else None
is_yweek_sel = revenueday['f_yweek'].isin(yweek) if yweek else None
selector = is_ind & ~is_adv_sale_tag_name
for sel_date in [is_year_sel, is_quarter_sel, is_month_sel, is_yweek_sel]:
    if sel_date is not None:
        selector &= sel_date
# Copy so the assignments below act on an independent frame.
revenueday = revenueday.loc[selector, :].copy()

# Exploratory checks; results are discarded when run as a script.
revenueday.KPI_first_ind.unique()
revenueday.adv_sale_tag_name.unique()
revenueday.f_sweek.unique()
revenueday.type.unique()

revenueday['agent_id'] = revenueday['agent_id'].apply(str)
revenueday['fuid'] = revenueday['fuid'].astype(str)

revnueday_groupby_scol = ['fuid', 'wechatappid', 'adv_name', 'agent_id', 'agent_name', 'KPI_first_ind', 'three_ind1',
                          'three_ind2', 'is_smb_original', 'big_area', 'province_area_manual']
# Group to one row per key combination; the summed cost itself is dropped.
revenueday = (
    revenueday.groupby(revnueday_groupby_scol)
    .agg({'real_cost': 'sum'})
    .reset_index()
    .fillna('0')
    .drop(['real_cost'], axis=1)
)
# Discard rows whose fuid is the 0 placeholder.
revenueday = revenueday.loc[~(revenueday['fuid'].isin(['0', 0])), :]
revenueday.info()
gzh_yinshe.info()

revenueday = revenueday.astype(str)
revenueday = pd.merge(revenueday, gzh_yinshe, on=['fuid'], how='left')
revenueday.rename(columns={'昵称': '公众号名称'}, inplace=True)
# Assign the result back: fillna(inplace=True) on a .loc selection is not
# guaranteed to modify the parent frame.
revenueday['公众号名称'] = revenueday['公众号名称'].fillna('0')
revenueday.drop_duplicates(subset=['fuid', 'agent_id'], keep='last', inplace=True)
revenueday.info()
revenueday = revenueday.astype(str)

## Process the GDT data
### Process greenspan
revenueday_id = revenueday.fuid.unique().tolist()
class RevenueDay:
    """Read and prepare the revenue data source used by the weekly report.

    The source file is loaded once, while the class body executes, and the
    resulting frame (``REVENUE_DATA``) is shared by every instance. An
    instance carries the filter values (quarter, year and optional
    ``industry`` / ``month`` / ``week`` keyword filters) and exposes
    de-duplicated views of the filtered data.
    """

    # Source columns kept from the pickle (names as stored, before renaming).
    use_col = ['ftime', 'fuid', 'adv_name', 'agent_id', 'agent_name', 'appid_id',
               'KPI_first_ind', 'new_first_industry', 'new_second_industry',
               'show', 'click', 'real_cost', 'is_smb2', 'adv_sale_tag_name', 'sign',
               'f_year', 'f_month', 'f_M', 'f_Q', 'f_sweek', 'f_yweek', 'f_YW', 'first_year',
               'first_month', 'first_sweek', 'product', 'flow_name_level2', 'product2', 'R_Group',
               'R_emp', 'big_area', 'province_manual', 'city', 'Q_new_old', 'is_tc', 'first_cost_date', 'CZ_amount']
    # Not loaded: 'lingyu', 'is_industry_one', 'is_industry_two', 'bidtype', 'year_new_old'

    # Key columns shared by both de-duplicated views.
    _GROUP_BY_COLS = ['fuid', 'wechatappid', 'adv_name', 'agent_id', 'agent_name', 'KPI_first_ind',
                      'three_ind1', 'three_ind2', 'is_smb_original', 'big_area', 'province_area_manual']

    REVENUE_DATA = read_file(os.path.join(DATA_CENTER['REVENUE_DATA'], 'revenue_day.pkl'))[use_col]
    REVENUE_DATA = REVENUE_DATA.rename(columns={
        'appid_id': 'wechatappid',
        'new_first_industry': 'three_ind1',
        'new_second_industry': 'three_ind2',
        'is_smb2': 'is_smb_original',
        'show': 'exposure_cnt',
        'click': 'click_cnt',
        'product': 'type',
        'province_manual': 'province_area_manual',
    })
    # Treat NaN, inf and the textual placeholders as 0.
    REVENUE_DATA = REVENUE_DATA.replace([np.nan, np.inf, 'NA', '-', 'nan'], [0, 0, 0, 0, 0])
    # Go through int64 first so the values are rendered without a decimal part.
    REVENUE_DATA[['fuid', 'f_year', 'f_yweek', 'f_month']] = REVENUE_DATA[
        ['fuid', 'f_year', 'f_yweek', 'f_month']].astype('int64').astype(str)
    print('读入revenueday')

    def __init__(self, quarter, year, **kwargs):
        """Store the filter values.

        :param quarter: quarter filter (str, int, list or tuple).
        :param year: year filter (str, int, list or tuple).
        :param kwargs: optional filters; ``industry``, ``month`` and ``week``
            are the keys read by ``get_select_array``.
        """
        self.quarter = quarter
        self.year = year
        self.revenueday = RevenueDay.REVENUE_DATA
        self.kwargs = kwargs

    def charge_type(self, key):
        """Normalise one filter value to a list.

        :param key: a str, an int, a list or a tuple.
        :return: ``[key]`` for a str, ``[str(key)]`` for an int, and a list
            copy of a list or tuple.
        :raises KeyError: for any other type.
        """
        if isinstance(key, str):
            return [key]
        if isinstance(key, int):
            return [str(key)]
        if isinstance(key, (list, tuple)):
            return list(key)
        raise KeyError('无法识别键值!')

    def check_variable_type(self):
        """Normalise every stored filter value to a list, in place."""
        for key, value in self.kwargs.items():
            self.kwargs[key] = self.charge_type(value)
        self.quarter = self.charge_type(self.quarter)
        self.year = self.charge_type(self.year)

    def get_select_array(self):
        """Build the boolean row mask for the stored filters.

        Filters that are missing or empty are skipped. The values are passed
        to ``Series.isin``, so they must already be lists (see
        ``check_variable_type``).

        :return: boolean Series aligned with ``self.revenueday``.
        """
        # Start from "keep everything" so an empty year list cannot end up
        # being used as the mask itself.
        mask = pd.Series(True, index=self.revenueday.index)
        sel_col_mapping = {
            'f_year': self.year,
            'KPI_first_ind': self.kwargs.get('industry'),
            'f_Q': self.quarter,
            'f_month': self.kwargs.get('month'),
            # The loaded columns contain no 'f_week'; 'f_yweek' is the week
            # column that is cast to str together with the other filter
            # columns. NOTE(review): confirm 'f_yweek' (not 'f_sweek') is the
            # intended target of the ``week`` filter.
            'f_yweek': self.kwargs.get('week'),
        }
        for sel_col, variable in sel_col_mapping.items():
            if variable:
                mask &= self.revenueday[sel_col].isin(variable)
        return mask

    def get_revenue_data(self, group_by_cols):
        """Return the filtered rows reduced to one row per key combination.

        Rows are grouped by ``group_by_cols``, ordered by ``fuid`` and summed
        ``real_cost``, and the cost column is then dropped. Rows whose
        ``fuid`` is the 0 placeholder are removed, the official-account
        mapping is left-joined on ``fuid``, and all columns are returned as
        str.

        :param group_by_cols: list of column names to group by.
        :return: DataFrame with the group-by columns plus ``app_name``.
        """
        self.check_variable_type()
        revenue = self.revenueday.loc[self.get_select_array(), :].copy()
        revenue = (
            revenue.groupby(group_by_cols)
            .agg({'real_cost': 'sum'})
            .reset_index()
            .fillna('0')
            .sort_values(by=['fuid', 'real_cost'])
            .drop(['real_cost'], axis=1)
        )
        revenue = revenue.loc[~(revenue['fuid'].isin(['0', 0])), :]
        # NOTE(review): assumes the mapping frame provides 'fuid' and
        # '公众号名称' columns - verify against OfficialAccount.
        revenue = pd.merge(revenue, OfficialAccount().official_account_mapping, on=['fuid'], how='left')
        # Assign the result back: fillna(inplace=True) on a .loc selection is
        # not guaranteed to modify the parent frame.
        revenue['公众号名称'] = revenue['公众号名称'].fillna('0')
        revenue = revenue.rename(columns={'公众号名称': 'app_name'})
        return revenue.astype(str)

    @property
    def revenue_data(self):
        """Filtered data with one row per (fuid, agent_id); the last row wins."""
        revenue = self.get_revenue_data(self._GROUP_BY_COLS)
        revenue = revenue.drop_duplicates(subset=['fuid', 'agent_id'], keep='last')
        return revenue

    @property
    def revenue_data_unique_fuid(self):
        """Filtered data with one row per fuid; the last row wins."""
        revenue = self.get_revenue_data(self._GROUP_BY_COLS)
        revenue = revenue.drop_duplicates(subset=['fuid'], keep='last')
        return revenue

    # @property
    # def revenue_id(self):
    #     return self.revenue_data['fuid'].unique().tolist()
# Example of the repetition being criticised: the same directory literal is
# spelled out in every call.
product_type_yinshe = pd.read_excel(r'E:/行业效果数据和OCPA数据/映射表/产品类型映射表.xlsx')
track_yinshe = pd.read_excel(r'E:/行业效果数据和OCPA数据/映射表/效果映射表.xlsx')
flow_yinshe = pd.read_excel(r'E:/行业效果数据和OCPA数据/映射表/站点映射表.xlsx')
这种情况在一个文件中出现 23 次,5 个文件合计 23 × 5 = 115 次


# The base directory is defined once; every other location derives from it.
PATH = 'E:/code_piggy/kiki_env/my_env/TsToolkit/report/行业效果数据和OCPA数据'
PATH_DATA_SOURCE = os.path.join(PATH, '数据源')
PATH_MAPPING = os.path.join(PATH, '映射表')
PATH_RESULT = os.path.join(PATH, '结果表')

# dtype=str loads every column as text, so no per-column cast is needed later.
product_type_yinshe = pd.read_excel(os.path.join(PATH_MAPPING, '产品类型映射表.xlsx'), dtype=str)
track_yinshe = pd.read_excel(os.path.join(PATH_MAPPING, '效果映射表.xlsx'), dtype=str)
from settings import DATA_CENTER, PATH_MATCHED, PATH_SOURCE
# Distinct fuid values present in revenueday.
revenueday_id = revenueday.fuid.unique().tolist()
# Names of the entries in the greenspan directory.
filehome = greenspan_file_name
fileall = os.listdir(filehome)
=, -, +=, ==, in, is not, and ...
# Three spellings of the same variable name, shown side by side.
# PascalCase spelling (PEP 8 uses this form for class names):
RevenueData = pd.DataFrame()
# camelCase spelling:
revenueData = pd.DataFrame()
# snake_case spelling (the PEP 8 form for variables):
revenue_data = pd.DataFrame()

标签:types cli port cos DPoS strong first 读取 文件中
原文地址:https://www.cnblogs.com/pikiki/p/13092409.html