The MAG dataset links keywords and their corresponding field of study. Moreover, the dataset provides us with hierarchical data that links these research fields with their parent research fields in up to 4 levels (L0-L3). In this notebook, we will use the research field data to better understand how various fields of study change over time. First, as in previous notebooks, let's load the required Python packages.
from configs import *
import pandas as pd
import numpy as np
import altair as alt
alt.renderers.enable('notebook')
from visualization.visual_utils import *
import turicreate.aggregate as agg
# Location of the field-of-study hierarchy SFrame, built under SFRAMES_BASE_DIR
# (SFRAMES_BASE_DIR is not defined in this file; presumably it comes from the
# star import of `configs` -- TODO confirm).
# NOTE(review): a later cell loads FIELDS_OF_STUDY_HIERARCHY_SFRAME instead of
# this constant; verify whether both names point at the same data.
FIELD_OF_STUDY_HIERARCHY = "%s/FieldOfStudyHierarchy.sframe" % SFRAMES_BASE_DIR
def normalize_features_dict(feature_dict, start_year):
    """Re-key a year-indexed dict so its keys are offsets from ``start_year``.

    Args:
        feature_dict: Mapping of year (a number) to an arbitrary value.
        start_year: Year subtracted from every key.

    Returns:
        A new dict mapping ``year - start_year`` to the original value.
        The input mapping is left unmodified.
    """
    # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` was removed in Python 3.
    return {(year - start_year): value for year, value in feature_dict.items()}
def get_values_sum_by_year_dict(d, max_keys):
    """Build running totals of ``d``'s values for keys ``0 .. max_keys - 1``.

    Args:
        d: Mapping of numeric key to a summable value.
        max_keys: Number of consecutive keys, starting at 0, to produce.

    Returns:
        A dict where entry ``i`` is the sum of every value in ``d`` whose key
        is ``<= i``. Keys below 0 therefore contribute to every entry. An
        empty ``d`` yields zeros; ``max_keys <= 0`` yields an empty dict.
    """
    # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` was removed in Python 3.
    return {
        i: sum(value for key, value in d.items() if key <= i)
        for i in range(max_keys)
    }
# Load only the columns this analysis uses from the extended papers SFrame.
fields_sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)[
    "Paper ID",
    "Paper publish year",
    "Fields of study parent list names (L0)",
    "Fields of study parent list (L1)",
    "Fields of study parent list names (L1)",
    "Fields of study parent list (L2)",
    "Fields of study parent list names (L2)",
    "Ref Number",
    "Authors Number",
    "Total Citations by Year",
]

# Use the shorter "Year" column name, then restrict to the 1850-2010 range.
fields_sf = filter_sframe_by_years(
    fields_sf.rename({"Paper publish year": "Year"}), 1850, 2010
)

# `!= None` is an element-wise SArray comparison here, so `is not None`
# would not be a substitute.
fields_sf = fields_sf[fields_sf["Fields of study parent list names (L0)"] != None]

# Remove papers that belong to more than 10 L0 fields.
fields_sf = fields_sf[
    fields_sf["Fields of study parent list names (L0)"].apply(
        lambda names: len(names) <= 10
    )
]

# Keep only papers with at least 5 references.
fields_sf = fields_sf[fields_sf["Ref Number"] >= 5]
fields_sf.materialize()
Let's create an SFrame with the information on all L0 fields of study.
# Build the L0 SFrame: stacking the L0 name list yields one row per
# (paper, L0 field) pair.
l0_sf = fields_sf[
    "Paper ID",
    "Year",
    "Fields of study parent list names (L0)",
    "Authors Number",
    "Ref Number",
    "Total Citations by Year",
].stack(
    "Fields of study parent list names (L0)",
    new_column_name="L0 Field",
)

# Drop rows whose stacked field value is missing.
l0_sf = l0_sf[l0_sf["L0 Field"] != None]
l0_sf.materialize()
l0_sf
Now let's explore how the various trends change in each research field, and we will add an SFrame which includes all the L1 research fields under Biology.
# Build the Biology L1 SFrame: one row per (paper, Biology L1 subfield).
f_sf = tc.load_sframe(FIELDS_OF_STUDY_SFRAME)
bio_id = f_sf[f_sf["Field of study name"] == "Biology"]["Field of study ID"][0]

# IDs of the L1 fields that the hierarchy table lists under Biology.
h_sf = tc.load_sframe(FIELD_OF_STUDY_HIERARCHY)
h_sf = h_sf[h_sf["Parent field of study ID"] == bio_id]
bio_l1_ids = set(
    h_sf[h_sf["Child field of study level"] == "L1"]["Child field of study ID"]
)

# Lookup table from those IDs to their display names.
bio_l1_names_sf = f_sf[
    f_sf["Field of study ID"].apply(lambda field_id: field_id in bio_l1_ids)
]
bio_l1_dict = {
    row["Field of study ID"]: row["Field of study name"]
    for row in bio_l1_names_sf
}

sf = fields_sf[
    "Paper ID",
    "Year",
    "Fields of study parent list names (L0)",
    "Fields of study parent list (L1)",
    "Authors Number",
    "Ref Number",
    "Total Citations by Year",
]

# Stack the L1 ID list and keep only the rows that fall under Biology.
l1_bio_sframe = sf[sf["Fields of study parent list (L1)"] != None]
l1_bio_sframe = l1_bio_sframe.stack(
    "Fields of study parent list (L1)",
    new_column_name="L1 Field ID",
)
l1_bio_sframe = l1_bio_sframe[
    l1_bio_sframe["L1 Field ID"].apply(lambda field_id: field_id in bio_l1_ids)
]
l1_bio_sframe = l1_bio_sframe[l1_bio_sframe["L1 Field ID"] != None]

# Attach the readable subfield name.
l1_bio_sframe["L1 Field"] = l1_bio_sframe["L1 Field ID"].apply(
    lambda field_id: bio_l1_dict[field_id]
)
l1_bio_sframe
# Build the Genetics L2 SFrame: one row per (paper, Genetics L2 subfield).
# Relies on `f_sf` (the fields-of-study table) loaded earlier in the notebook.
gen_id = f_sf[f_sf["Field of study name"] == "Genetics"]["Field of study ID"][0]

# IDs of the L2 fields that the hierarchy table lists under Genetics.
h_sf = tc.load_sframe(FIELD_OF_STUDY_HIERARCHY)
h_sf = h_sf[h_sf["Parent field of study ID"] == gen_id]
gen_l2_ids = set(
    h_sf[h_sf["Child field of study level"] == "L2"]["Child field of study ID"]
)

# Lookup table from those IDs to their display names.
gen_l2_names_sf = f_sf[
    f_sf["Field of study ID"].apply(lambda field_id: field_id in gen_l2_ids)
]
gen_l2_dict = {
    row["Field of study ID"]: row["Field of study name"]
    for row in gen_l2_names_sf
}

sf = fields_sf[
    "Paper ID",
    "Year",
    "Fields of study parent list (L2)",
    "Authors Number",
    "Ref Number",
    "Total Citations by Year",
]

# Stack the L2 ID list and keep only the rows that fall under Genetics.
l2_gen_sframe = sf[sf["Fields of study parent list (L2)"] != None]
l2_gen_sframe = l2_gen_sframe.stack(
    "Fields of study parent list (L2)",
    new_column_name="L2 Field ID",
)
l2_gen_sframe = l2_gen_sframe[
    l2_gen_sframe["L2 Field ID"].apply(lambda field_id: field_id in gen_l2_ids)
]
l2_gen_sframe = l2_gen_sframe[l2_gen_sframe["L2 Field ID"] != None]

# Attach the readable subfield name.
l2_gen_sframe["L2 Field"] = l2_gen_sframe["L2 Field ID"].apply(
    lambda field_id: gen_l2_dict[field_id]
)
l2_gen_sframe
# Number of papers per (year, L0 field).
g = l0_sf.groupby(
    ["Year", "L0 Field"],
    {"Number of Papers": agg.COUNT()},
)
df = g.to_dataframe().sort_values(["L0 Field", "Year"])

# One line per L0 field on a shared axis.
chart = alt.Chart(df).mark_line().encode(
    x=alt.X("Year:Q", axis=alt.Axis(format="d"), scale=alt.Scale(zero=False)),
    y=alt.Y("Number of Papers:Q", scale=alt.Scale(zero=False)),
    color="L0 Field",
)
chart

# The same data as a grid of small multiples, one facet per L0 field.
c = sns.FacetGrid(df, col="L0 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Number of Papers", alpha=0.7)
c.set_titles("{col_name}")

# Per-field paper counts for 2010, sorted ascending.
g[g["Year"] == 2010].sort("Number of Papers").print_rows(30)
We can observe that some fields such as Medicine, Physics, and CS had a surge in the number of papers, while other fields such as Political Science, Art, and Philosophy only had limited growth. Let's see what the number of papers over time is for various Biology subfields.
# Number of papers per (year, Biology L1 subfield), one facet per subfield.
g = l1_bio_sframe.groupby(
    ["Year", "L1 Field"],
    {"Number of Papers": agg.COUNT()},
)
df = g.to_dataframe().sort_values(["L1 Field", "Year"])
c = sns.FacetGrid(df, col="L1 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Number of Papers", alpha=0.7, color="g")
c.set_titles("{col_name}")
From the above chart, we can observe that the various subfields of Biology have a large difference in the number of published papers. For example, in Genetics more than 114,532 papers were published in 2010, while in Virology the number of papers was only 2,597.
# Number of papers per (year, Genetics L2 subfield), one facet per subfield.
g = l2_gen_sframe.groupby(
    ["Year", "L2 Field"],
    {"Number of Papers": agg.COUNT()},
)
df = g.to_dataframe().sort_values(["L2 Field", "Year"])
c = sns.FacetGrid(df, col="L2 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Number of Papers", alpha=0.7, color="orange")
c.set_titles("{col_name}")
Let's measure the average number of authors over time for the various research fields.
# Mean author count per (year, L0 field), one facet per field.
g = l0_sf.groupby(
    ["Year", "L0 Field"],
    {"Average Number of Authors": agg.AVG("Authors Number")},
)
df = g.to_dataframe().sort_values(["L0 Field", "Year"])
c = sns.FacetGrid(df, col="L0 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Average Number of Authors", alpha=0.7)
c.set_titles("{col_name}")
We can observe that across all research fields the average number of authors has increased sharply over the last century. However, while the average number of authors in Medicine and Biology is over 5 authors, in fields such as Political Science, Sociology, and Philosophy the average number of authors is about 3. Let's now observe how the average number of authors in Biology subfields has changed over time.
# Mean author count per (year, Biology L1 subfield).
g = l1_bio_sframe.groupby(
    ["Year", "L1 Field"],
    {"Average Number of Authors": agg.AVG("Authors Number")},
)
df = g.to_dataframe().sort_values(["L1 Field", "Year"])
c = sns.FacetGrid(df, col="L1 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Average Number of Authors", alpha=0.7, color="g")
c.set_titles("{col_name}")

# Mean author count per (year, Genetics L2 subfield).
g = l2_gen_sframe.groupby(
    ["Year", "L2 Field"],
    {"Average Number of Authors": agg.AVG("Authors Number")},
)
df = g.to_dataframe().sort_values(["L2 Field", "Year"])
c = sns.FacetGrid(df, col="L2 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Average Number of Authors", alpha=0.7, color="orange")
c.set_titles("{col_name}")
We can also observe a wide diversity in the average number of authors in various Biology subfields.
Let's calculate the average number of references in the L0 research fields, and in the Biology subfields.
# Mean reference count per (year, L0 field), one facet per field.
g = l0_sf.groupby(
    ["Year", "L0 Field"],
    {"Average References Number": agg.AVG("Ref Number")},
)
df = g.to_dataframe().sort_values(["L0 Field", "Year"])
c = sns.FacetGrid(df, col="L0 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Average References Number", alpha=0.7)
c.set_titles("{col_name}")
It seems that across all the research fields the average number of references per paper has increased sharply in recent years. However, there are research fields, such as Biology, in which the average number of references is above 30, while in other fields, such as Mathematics, the average number of references is about 20. Let's observe what the average number of references is in Biology subfields:
# Mean reference count per (year, Biology L1 subfield), one facet per subfield.
g = l1_bio_sframe.groupby(
    ["Year", "L1 Field"],
    {"Average References Number": agg.AVG("Ref Number")},
)
df = g.to_dataframe().sort_values(["L1 Field", "Year"])
c = sns.FacetGrid(df, col="L1 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Average References Number", alpha=0.7, color="g")
c.set_titles("{col_name}")
As in previous results, we can observe that the average number of references has considerably increased over all the Biology subfields. However, while papers in some subfields increased to over 40 references, papers in other subfields had many fewer references.
# Mean reference count per (year, Genetics L2 subfield), one facet per subfield.
g = l2_gen_sframe.groupby(
    ["Year", "L2 Field"],
    {"Average References Number": agg.AVG("Ref Number")},
)
df = g.to_dataframe().sort_values(["L2 Field", "Year"])
c = sns.FacetGrid(df, col="L2 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Average References Number", alpha=0.7, color="orange")
c.set_titles("{col_name}")
Let's calculate the average and median number of citations in the L0 research fields, and in the Biology subfields.
def citations_after_years(citations_dict, year, after_years=5):
    """Return the largest citation value recorded up to ``year + after_years``.

    Args:
        citations_dict: Mapping of year to a citation count, or ``None``.
            Keys are passed through ``int()``, so numeric strings and ints
            both work. The caller passes the "Total Citations by Year"
            column, so values are presumably cumulative totals -- TODO
            confirm against the dataset.
        year: Reference year, typically the publication year.
        after_years: Size of the window after ``year``; the cutoff year
            itself is included.

    Returns:
        The maximum value among entries whose year is ``<= year +
        after_years``, or 0 when ``citations_dict`` is ``None`` or has no
        entry inside the window.
    """
    if citations_dict is None:
        return 0
    cutoff = year + after_years
    # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` was removed in Python 3.
    in_window = [
        count for cited_year, count in citations_dict.items()
        if int(cited_year) <= cutoff
    ]
    if not in_window:
        return 0
    return max(in_window)
# --- L0 fields: citations within 5 years of publication ---
l0_sf["Total Ciations After 5 Years"] = l0_sf.apply(
    lambda row: citations_after_years(row["Total Citations by Year"], row["Year"], 5)
)
g = l0_sf.groupby(
    ["Year", "L0 Field"],
    {
        "Average Citations after 5 Years": agg.AVG("Total Ciations After 5 Years"),
        "Citations after 5 Years List": agg.CONCAT("Total Ciations After 5 Years"),
    },
)
# The median is taken from the concatenated per-group list of values; the
# list column is dropped once it has been used.
g["Median Citations after 5 Years"] = g["Citations after 5 Years List"].apply(
    lambda values: np.median(values)
)
g = g.remove_column("Citations after 5 Years List")
df = g.to_dataframe().sort_values(["L0 Field", "Year"])

c = sns.FacetGrid(df, col="L0 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Median Citations after 5 Years", alpha=0.7)
c.set_titles("{col_name}")

c = sns.FacetGrid(df, col="L0 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Average Citations after 5 Years", alpha=0.7)
c.set_titles("{col_name}")

# --- Biology L1 subfields ---
l1_bio_sframe["Total Ciations After 5 Years"] = l1_bio_sframe.apply(
    lambda row: citations_after_years(row["Total Citations by Year"], row["Year"], 5)
)
g = l1_bio_sframe.groupby(
    ["Year", "L1 Field"],
    {
        "Average Citations after 5 Years": agg.AVG("Total Ciations After 5 Years"),
        "Citations after 5 Years List": agg.CONCAT("Total Ciations After 5 Years"),
    },
)
g["Median Citations after 5 Years"] = g["Citations after 5 Years List"].apply(
    lambda values: np.median(values)
)
g = g.remove_column("Citations after 5 Years List")
df = g.to_dataframe().sort_values(["L1 Field", "Year"])

c = sns.FacetGrid(df, col="L1 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Average Citations after 5 Years", alpha=0.7, color="g")
c.set_titles("{col_name}")

c = sns.FacetGrid(df, col="L1 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Median Citations after 5 Years", alpha=0.7, color="g")
c.set_titles("{col_name}")

# --- Genetics L2 subfields ---
l2_gen_sframe["Total Ciations After 5 Years"] = l2_gen_sframe.apply(
    lambda row: citations_after_years(row["Total Citations by Year"], row["Year"], 5)
)
g = l2_gen_sframe.groupby(
    ["Year", "L2 Field"],
    {
        "Average Citations after 5 Years": agg.AVG("Total Ciations After 5 Years"),
        "Citations after 5 Years List": agg.CONCAT("Total Ciations After 5 Years"),
    },
)
g["Median Citations after 5 Years"] = g["Citations after 5 Years List"].apply(
    lambda values: np.median(values)
)
g = g.remove_column("Citations after 5 Years List")
df = g.to_dataframe().sort_values(["L2 Field", "Year"])

c = sns.FacetGrid(df, col="L2 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Average Citations after 5 Years", alpha=0.7, color="orange")
c.set_titles("{col_name}")

c = sns.FacetGrid(df, col="L2 Field", col_wrap=5, sharex=True, sharey=True)
c.map(plt.plot, "Year", "Median Citations after 5 Years", alpha=0.7, color="orange")
c.set_titles("{col_name}")
# Build per-L3-field statistics for papers published in 2009, attach each L3
# field's name and its L0 parent field, and draw a boxplot of the L3 medians
# grouped by parent field.
l3_fields_sf = tc.load_sframe(EXTENDED_PAPERS_SFRAME)["Paper ID", "Paper publish year",
'Fields of study parent list (L3)',
"Ref Number", "Authors Number", "Total Citations by Year"]
l3_fields_sf = l3_fields_sf.rename({"Paper publish year": "Year"})
l3_fields_sf = l3_fields_sf[l3_fields_sf['Fields of study parent list (L3)'] != None]
l3_fields_sf = l3_fields_sf[l3_fields_sf['Fields of study parent list (L3)'].apply(lambda l: len(l) <= 100)] # remove papers that belong to more than 100 L3 fields
# Keep only papers with at least 5 references.
l3_fields_sf = l3_fields_sf[l3_fields_sf["Ref Number"] >= 5]
l3_fields_sf.materialize()
# Restrict to the single publication year 2009.
l3_fields_sf = filter_sframe_by_years(l3_fields_sf, 2009, 2009)
# One row per (paper, L3 field ID).
l3_fields_sf = l3_fields_sf.stack("Fields of study parent list (L3)", new_column_name="L3 Field ID")
# Drop rows without a citations dict before computing the 5-year citation value.
l3_fields_sf = l3_fields_sf[l3_fields_sf['Total Citations by Year'] != None]
l3_fields_sf['Citations After 5 Years'] = l3_fields_sf.apply(lambda r: citations_after_years(r['Total Citations by Year'],
r["Year"], 5))
sf = l3_fields_sf["Paper ID","Year", "Authors Number","Citations After 5 Years", "L3 Field ID" ]
# Aggregate per L3 field: distinct paper count, mean author count, the list of
# 5-year citation values (used for the median below) and their maximum.
g = sf.groupby("L3 Field ID", { "Papers Number": agg.COUNT_DISTINCT("Paper ID"),
"Average Author Number": agg.AVG("Authors Number"),
"Citations After 5-years List": agg.CONCAT("Citations After 5 Years"),
"MAX Citations After 5-years": agg.MAX("Citations After 5 Years")
})
g = g[g["Papers Number"] >= 100] # keep only L3 fields with at least 100 distinct papers in the filtered 2009 subset
g["Median Citations After 5-years"] = g["Citations After 5-years List"].apply(lambda l: round(np.median(l),2))
# Attach the L3 field's own name.
h = tc.load_sframe(FIELDS_OF_STUDY_SFRAME)
g = g.join(h, on={'L3 Field ID': 'Field of study ID'})
# NOTE(review): FIELDS_OF_STUDY_HIERARCHY_SFRAME is not defined in this file
# (the file itself defines FIELD_OF_STUDY_HIERARCHY); presumably it comes from
# the star import of `configs` -- TODO confirm.
h2 = tc.load_sframe(FIELDS_OF_STUDY_HIERARCHY_SFRAME)
# Keep only L3 -> L0 hierarchy links with confidence of at least 0.95.
h2 = h2[h2['Child field of study level'] == "L3"]
h2 = h2[h2['Parent field of study level'] == "L0"]
h2 = h2[h2['Confidence'] >= 0.95]
g = g.join(h2, on={'L3 Field ID': 'Child field of study ID'})
# Second join against the names table, this time to name the L0 parent. The
# clashing name column is presumably auto-suffixed to "Field of study name.1",
# which the rename below relies on -- TODO confirm; do not reorder these joins.
g = g.join(h, on={"Parent field of study ID": 'Field of study ID' })
g = g.rename({"Field of study name.1": "Parent Field of Study"})
g = g["Parent Field of Study", "Field of study name", "Median Citations After 5-years", "MAX Citations After 5-years", "Papers Number", "Average Author Number" ]
# Tidy numeric columns for display.
g["MAX Citations After 5-years"] = g["MAX Citations After 5-years"].apply(lambda i: int(i))
g["Average Author Number"] = g["Average Author Number"].apply(lambda i: round(i,2))
g = g.sort(["Parent Field of Study", "Median Citations After 5-years"])
# Save to file
#html = g.to_dataframe().to_html()
#file(u'/mnt/data/fields_stat.html', 'wb').write(html.encode('utf8'))
g
# NOTE(review): the sorted result is not assigned, so `g` keeps its previous
# order; presumably this line only displays the sorted view in the notebook.
g.sort("Median Citations After 5-years", ascending=True)
# Horizontal boxplot: distribution of L3-subfield medians per L0 parent field.
plt.subplots(figsize=(20,15))
s = sns.boxplot(y="Parent Field of Study", x="Median Citations After 5-years", data=g.to_dataframe(), orient="h")
s.set(xlabel='L3 Subfields Median Citations After 5-Years')
As in previous results, we can observe that the average and median numbers of citations have considerably increased across all the Biology subfields. However, as can be observed from the above results, different research domains have different average and median numbers of citations.
Overall, in our analysis, we discovered that different domains had a wide range of properties that evolved differently over time.