"""Originality of first editions (article2_originality.py).

Builds the per-decade originality summary, Figures 2.2-2.4 and Table 2.1
(originality by author).
"""
import pandas as pd
import matplotlib.pyplot as plt
from article0_datasets import hoe_meta_df
from config_visuals import sns, save_image_in_all_formats
# Decade-level originality summary.
# Only consider 'first' editions, to get the originality of new publications
# rather than of every edition in the dataset.
originality_by_decade_df = hoe_meta_df.loc[
    hoe_meta_df['sequence'] == 'first',
    ['publication_decade', 'originality'],
]
# A single named aggregation replaces four separate group-bys that were
# stitched together with merges. The resulting columns and their order are
# unchanged: publication_decade, mean, median, sd, titles.
# 'size' counts every row of the decade, including rows whose originality is
# missing, exactly as the separate .size() call did.
originality_summary = (
    originality_by_decade_df
    .groupby('publication_decade')['originality']
    .agg(mean='mean', median='median', sd='std', titles='size')
    .reset_index()
)
# ----------
# Figure 2.2
# ----------
# Originality histogram: for every decade, the share (%) of first editions that
# falls into each 10-point originality bin; the bar plot then averages those
# per-decade shares across all plotted decades.
originality_bin_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
# Each bin is labelled with its lower edge expressed in percent.
originality_bin_labels = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

originality_grouped = hoe_meta_df.loc[
    hoe_meta_df['sequence'] == 'first',
    ['publication_decade', 'originality'],
]
# include_lowest=True keeps titles whose originality is exactly 0 in the first
# bin; with the default right-closed bins they would become NaN and silently
# drop out of the counts and shares.
originality_bins = pd.cut(
    originality_grouped['originality'],
    bins=originality_bin_edges,
    labels=originality_bin_labels,
    include_lowest=True,
)
# observed=False is stated explicitly: every decade must carry all ten bins
# (empty ones with count 0), otherwise the across-decade mean taken by the bar
# plot would skip the zero shares. The default of `observed` differs between
# pandas versions, so it must not be left implicit.
originality_grouped = (
    originality_grouped
    .groupby(['publication_decade', originality_bins], observed=False)
    .size()
    .reset_index(name='count')
)
# Share of each bin within its own decade.
originality_grouped['%'] = (
    100 * originality_grouped['count']
    / originality_grouped.groupby('publication_decade')['count'].transform('sum')
)
# The bin labels come back as a categorical; plotting uses plain integers.
originality_grouped['originality'] = originality_grouped['originality'].astype(int)

plot_date_lower = 1650
# .copy() makes the subset an independent frame, so columns added to it later
# do not trigger pandas' chained-assignment warning.
plot_hist_subset = originality_grouped.loc[
    originality_grouped['publication_decade'] >= plot_date_lower
].copy()

# Originality histogram combined
ax = sns.barplot(
    data=plot_hist_subset,
    x='originality',
    y='%',
    order=originality_bin_labels,
    errorbar=None,
)
# ax.set_title("First Edition Originality")
ax.set_xlabel("Originality (%)")
ax.set_ylabel("Share (%)")
plt.tight_layout()
# NOTE(review): save_image_in_all_formats is a project helper from
# config_visuals; presumably it writes the current figure under this name in
# several formats - confirm there.
save_image_in_all_formats(plt, 'Fig.2.2')
# ----------
# Figure 2.3
# ----------
# Originality per decade as a line, with a shaded band of +/- one standard
# deviation around it.
plot_date_lower = 1650
# Which summary column to plot: 'mean' or 'median'.
mean_or_median = 'mean'
# .copy() so the band columns are added to an independent frame instead of a
# slice of originality_summary (avoids SettingWithCopyWarning).
plot_subset = originality_summary.loc[
    originality_summary['publication_decade'] >= plot_date_lower
].copy()
plot_subset['lower_bound'] = plot_subset[mean_or_median] - plot_subset['sd']
plot_subset['upper_bound'] = plot_subset[mean_or_median] + plot_subset['sd']
# NOTE(review): fill_between draws on matplotlib's *current* axes, so this
# relies on the previous figure having been closed or cleared by
# save_image_in_all_formats - confirm in config_visuals.
plt.fill_between(
    plot_subset['publication_decade'],
    plot_subset['lower_bound'],
    plot_subset['upper_bound'],
    linestyle='',
    color='0.9',
)
ax = sns.lineplot(data=plot_subset, x="publication_decade", y=mean_or_median)
# ax.set_title("Mean Originality")
ax.set_xlabel("Decade")
# Derive the label from the chosen statistic so it cannot go stale when
# mean_or_median is switched; for 'mean' this is still "Originality (Mean)".
ax.set_ylabel(f"Originality ({mean_or_median.capitalize()})")
plt.tight_layout()
save_image_in_all_formats(plt, 'Fig.2.3')
# ----------
# Figure 2.4
# ----------
# Originality histograms by decade: one small bar chart per decade, five per row.
# assign() returns a new frame carrying the extra 'Decade' column (used only so
# the facet titles read "Decade = ..."); writing the column straight into the
# .loc slice would raise pandas' chained-assignment warning.
plot_hist_subset = plot_hist_subset.assign(
    Decade=plot_hist_subset['publication_decade']
)
g = sns.FacetGrid(
    plot_hist_subset,
    col="Decade",
    col_wrap=5,
    height=3,
    ylim=(0, 60),  # shared y-range; shares above 60 % would be cut off
)
# A fixed bar order keeps the bins aligned across facets even when a decade
# has no rows for some bin.
g.map(
    sns.barplot,
    "originality",
    "%",
    order=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90],
)
g.set_xlabels("Originality (%)")
g.set_ylabels("Share (%)")
# g.figure.suptitle('Originality by Decade')
plt.tight_layout()
save_image_in_all_formats(plt, 'Fig.2.4')
# ----------
# Table 2.1
# ----------
# Originality by authors: mean, median and standard deviation of first-edition
# originality per author, plus a title count, written to CSV.
og_by_authors = hoe_meta_df.loc[hoe_meta_df['sequence'] == 'first'].copy()

# The mean is grouped by id *and* unified name so the name column is carried
# into the table; its column keeps the name 'originality'.
mean_by_author = (
    og_by_authors
    .groupby(['actor_id', 'name_unified'])['originality']
    .mean()
    .reset_index()
)
# Median and standard deviation are grouped by id only.
spread_by_author = (
    og_by_authors
    .groupby(['actor_id'])['originality']
    .agg(median='median', sd='std')
    .reset_index()
)
# NOTE(review): the title count is taken over *all* rows of hoe_meta_df (every
# 'sequence' value), whereas the statistics above use first editions only -
# confirm that this difference is intended.
title_counts = hoe_meta_df.groupby('actor_id').size().reset_index(name='titles')

# Default (inner) merges: only authors present on both sides are kept.
og_by_authors_sum = (
    mean_by_author
    .merge(spread_by_author, on=['actor_id'])
    .merge(title_counts, on=['actor_id'])
    .sort_values('titles', ascending=False)
)
og_by_authors_sum.to_csv('data/work/originality_by_authors.csv', index=False)