@Helw150
Last active May 6, 2024 17:33

Revisions

1. Helw150 revised this gist May 6, 2024. 1 changed file with 0 additions and 3 deletions (process_parses.py):

   @@ -326,9 +326,6 @@ def load_open_ai_cache():

        def get_langs_chatgpt(row):
            global response_cache
   -        # openai.api_base = "https://diyi-group-nsf.openai.azure.com/"
   -        # openai.api_type = "azure"
   -        # openai.api_version = "2023-05-15"
            engine = "chatgpt0613"
            if not len(row["lang_mentions_sample"]) > 0:
                row["open_ai_resp"] = str({})
2. Helw150 created this gist May 6, 2024. 1 changed file with 435 additions and 0 deletions (process_parses.py). The file as created:

import ast

# To Delete After Debug
import code
import copyreg
import datetime
import functools
import json
import os
import re
import time
from ast import literal_eval
from collections import defaultdict
from urllib.parse import urlsplit

import gnureadline
import numpy as np
import openai
import pandas as pd
import ray
import tiktoken
from countryguess import guess_country
from fuzzysearch import find_near_matches
from nltk.tokenize import sent_tokenize
from tqdm import tqdm


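# domains.csv holds "email-suffix,location" pairs (parsed line by line below);
# get_locations uses it to resolve an author's country directly from the
# suffix of their email address.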
def load_mapping():
    with open("domains.csv", "r") as f:
        lines = f.readlines()
    return {line.split(",")[0].strip(): line.split(",")[1].strip() for line in lines}


enc = tiktoken.encoding_for_model("gpt-3.5-turbo")


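# tiktoken Encoding objects aren't picklable out of the box; this reducer is
# registered with copyreg below so the encoder can be shipped to Ray workers.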
def pickle_Encoding(enc):
    return (
        functools.partial(
            tiktoken.core.Encoding,
            enc.name,
            pat_str=enc._pat_str,
            mergeable_ranks=enc._mergeable_ranks,
            special_tokens=enc._special_tokens,
        ),
        (),
    )


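# Register the pickle reducer, then build an alternation pattern over every
# country name in list-of-countries.txt ("," is included so the pattern can
# also split on comma delimiters); compiled below as country_pattern.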
copyreg.pickle(tiktoken.core.Encoding, pickle_Encoding)
countries = (
    "("
    + "|".join(
        [
            re.escape(country)
            for country in open("list-of-countries.txt", "r").read().split("\n")
        ]
        + [","]
    )
    + ")"
)

# Manually add Chinese since it's not in the taxonomy
langs = (
    r"\b("
    + "|".join([re.escape(lang) for lang in pd.read_csv("lang2tax.txt").language.array])
    + "|Chinese"
    + r")\b"
)
lang_pattern = re.compile(langs, flags=re.IGNORECASE)

# world-universities.csv maps university names to country codes and websites;
# index by name and keep only the bare domain of each website for matching
# against email addresses.
uni_db = pd.read_csv("world-universities.csv").astype(str)
uni_db["uni_website"] = uni_db["uni_website"].map(
    lambda x: urlsplit(x).netloc.replace("www.", "")
)
uni_db = uni_db.set_index(["uni_name"]).sort_index()

# GeoNames postal-code database, indexed by postal code, for resolving
# affiliation addresses to countries.
zcdb = pd.read_csv(
    "allCountries.txt",
    sep="\t",
    dtype=str,
    names=[
        "country_code",
        "postal_code",
        "place_name",
        "admin1_name",
        "admin1_code",
        "admin2_name",
        "admin2_code",
        "admin3_name",
        "admin3_code",
        "latitude",
        "longitude",
        "accuracy",
    ],
).set_index(["postal_code"])

# Strip French "CEDEX" suffixes so those postal codes match GeoNames entries
cedex_pattern = r"CEDEX( [0-9])?"
zcdb.index = zcdb.index.map(str).map(lambda x: re.sub(cedex_pattern, "", x))
zcdb = zcdb.astype(str).sort_index()

country_pattern = re.compile(countries, flags=re.IGNORECASE)

# The first pattern covers the most common postal-code formats; the second
# captures all global formats in the GeoNames DB.
zip_patterns = [
    r"[0-9]{4,5}",
    r"(?i)([a-z0-9][a-z0-9\- ]{0,10}[a-z0-9])",
]
email_mapping = load_mapping()


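# ACL Anthology IDs come in two flavors: old-style IDs such as "P19-1001"
# start with a letter, while new-style IDs such as "2020.acl-main.173" start
# with a year, which is stripped here. The derived prefixes determine where
# the parsed JSON for a paper lives on disk.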
def get_prefixes(acl_id):
    prefix = acl_id[0:] if acl_id[0].isalpha() else acl_id[5:]
    first_prefix = prefix[0].upper()
    second_prefix = prefix.split("-")[0]
    return first_prefix, second_prefix


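# Attach the paper's parsed full-text JSON to the row, falling back to an
# alternate filename convention derived from the paper URL when the first
# path is missing. Rows that fail both lookups get None and are dropped later.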
def get_json(row):
    acl_id = row["acl_id"]
    first_prefix, second_prefix = get_prefixes(acl_id)
    try:
        row["json"] = open(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix}/{acl_id}.json", "r"
        ).read()
    except:
        prior = row["url"].split("/")[-3][-2:]
        latter = row["url"].split("/")[-1].replace(".pdf", "")
        print(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json"
        )
        try:
            row["json"] = open(
                f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json",
                "r",
            ).read()
        except:
            row["json"] = None
    return row


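# Resolve an author record to a list of countries, trying signals roughly in
# decreasing order of reliability: known email suffix -> explicit country
# field -> postal code (GeoNames) -> university email domain -> fuzzy match
# of the institution name against the university database.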
def get_locations(author):
    affiliation = author["affiliation"]
    # Signal 1: known email suffix
    if "email" in author and author["email"].split(".")[-1] in email_mapping:
        return [email_mapping[author["email"].split(".")[-1]]]
    # Signal 2: explicit country field in the affiliation location
    if "location" in affiliation and "country" in affiliation["location"]:
        countries = []
        for country in re.split(
            country_pattern,
            affiliation["location"]["country"].replace(" and ", ","),
        ):
            guess = guess_country(country)
            if guess:
                countries.append(guess["name_short"])
        if len(countries) > 0:
            return countries
    # Signal 3: postal code resolved through GeoNames
    if "location" in affiliation and len(affiliation["location"]) > 0:
        location = defaultdict(str, affiliation["location"])
        full_addr = f"{location['addrLine']} {location['settlement']} {location['region']} {location['postCode']}"
        if "postCode" in location:
            code_matches = []
            zip_codes = [
                match
                for pattern in zip_patterns
                for match in re.findall(pattern, full_addr)
            ]
            zip_codes = [code for code in zip_codes if code in zcdb.index]
            zipc = zcdb.loc[zip_codes]
            # Only trust a postal-code hit if its place or admin-region name
            # also appears in the address
            for code in zipc.iloc:
                if (
                    code.place_name in full_addr
                    or (code.admin1_code in full_addr and code.admin1_code.isalpha())
                    or code.admin1_name in full_addr
                ):
                    code_matches.append(guess_country(code.country_code)["name_short"])
            if len(code_matches) > 0:
                return code_matches

    # Signal 4: university matched by email domain
    if "email" in author and author["email"]:
        author_website = author["email"].split("@")[-1]
        if author_website in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_website].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]
        # Retry with the bare top-level domain (e.g. cs.example.edu -> example.edu)
        author_tld_only = (
            ".".join(author_website.split(".")[-2:])
            if len(author_website.split(".")) > 2
            else None
        )

        if author_tld_only and author_tld_only in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_tld_only].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]

        tld_only = uni_db.uni_website.map(lambda x: ".".join(x.split(".")[-2:]))
        if author_website in tld_only.array:
            return [
                guess_country(
                    uni_db[tld_only == author_website].iloc[0]["country_code"]
                )["name_short"]
            ]

    # Signal 5: fuzzy match of the institution/laboratory name against the
    # university database (either key may be missing, hence .get)
    if "institution" in affiliation or "laboratory" in affiliation:
        lab_and_school = f"{affiliation.get('laboratory', '')} {affiliation.get('institution', '')}"
        school_matches = []
        for uni in uni_db.index.array:
            cand_match = find_near_matches(
                uni, lab_and_school, max_l_dist=1, max_substitutions=0, max_deletions=0
            )
            if len(cand_match) > 0:
                match = cand_match[0].matched
                school_matches.append((match, uni))
        # Keep only the most specific candidates: drop any match whose matched
        # text is a substring of a different candidate's match
        c = school_matches
        school_matches = [
            match[1]
            for match in school_matches
            if all(
                [
                    (match[0] not in c_match[0] or match[1] == c_match[1])
                    for c_match in school_matches
                ]
            )
        ]

        if len(school_matches) > 0:
            countries = []
            for match in school_matches:
                country_info = uni_db.loc[match].country_code
                if isinstance(country_info, str) and country_info != "nan":
                    countries.append(guess_country(country_info)["name_short"])
            if len(countries) > 0:
                return countries

    return []


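# Collect the deduplicated set of countries across all of a paper's authors;
# non-string entries (failed lookups) are filtered out.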
def get_institutions(row):
    row["countries"] = set(
        filter(
            lambda x: str(x) == x,
            [
                location
                for author in json.loads(row["json"])["authors"]
                for location in get_locations(author)
            ],
        )
    )
    return row


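# Pipeline stage: attach parsed JSON to every row and drop rows with no parse.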
def add_json(rdf):
    return rdf.map(get_json).filter(lambda row: row["json"] is not None)


def dedupe_country(row):
    countries = row["countries"]
    normalized_countries = []
    for country in countries:
        # "Europe" is not a country, so guess_country cannot normalize it
        if country == "Europe":
            normalized_countries.append(country)
        else:
            normalized = guess_country(country)
            normalized_countries.append(normalized["name_short"])
    row["countries"] = set(normalized_countries)
    return row


def add_country(rdf):
    return (
        rdf.map(get_institutions)
        .filter(lambda row: len(row["countries"]) > 0)
        .map(dedupe_country)
    )


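# Pipeline stage: find every taxonomy language mentioned in the paper's full
# text (case-insensitive) and normalize the casing of each match.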
def check_lang(row):
    row["langs"] = [
        match.lower().capitalize()
        for match in re.findall(lang_pattern, str(row["full_text"]))
    ]
    return row


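# Undo the "[SEP]"-joined serialization used when list-valued columns are
# round-tripped through the pandas caches in the main block below.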
def re_list(row):
    if "countries" in row:
        row["countries"] = row["countries"].split("[SEP]")
    if "langs" in row:
        row["langs"] = row["langs"].split("[SEP]")
    return row


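# Sample up to 5 mention sentences per paper (all of them if there are fewer)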
sample_5 = functools.partial(np.random.choice, size=5, replace=False)
sample = lambda x: sample_5(x) if len(x) >= 5 else x


def get_lang_sents(row):
    row["langs"] = [lang for lang in row["langs"] if len(lang) > 0]
    # Keep sentences under 1000 chars that mention any detected language
    sents = [
        sent
        for sent in sent_tokenize(row["full_text"])
        if any([lang in sent for lang in row["langs"]])
        if len(sent) < 1000
    ]
    row["lang_mentions"] = sents
    row["lang_mentions_sample"] = list(sample(sents))
    # Token count of the sample, for tracking the prompt budget
    row["tok_len"] = len(enc.encode(str(row["lang_mentions_sample"])))
    return row


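# The OpenAI response cache is an append-only text file; each record is
#   [BEG_WILL]<acl_id>[MID_WILL]<response JSON>[END_WILL]
# (see the commented-out write in get_langs_chatgpt below).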
def load_open_ai_cache():
    with open("raw_openai_resp_cache", "r") as f:
        raw = f.read()
    entries = [
        line.replace("[BEG_WILL]", "").split("[MID_WILL]")
        for line in raw.split("[END_WILL]")
    ]
    c = {entry[0]: json.loads(entry[1]) for entry in entries if len(entry) == 2}
    return c


response_cache = load_open_ai_cache()


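# Look up the ChatGPT language-extraction response for a row. The live Azure
# OpenAI call below is commented out, so this now only serves cached
# responses; rows with no mention sample or no cache hit get an empty dict.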
def get_langs_chatgpt(row):
    global response_cache
    # openai.api_base = "https://diyi-group-nsf.openai.azure.com/"
    # openai.api_type = "azure"
    # openai.api_version = "2023-05-15"
    engine = "chatgpt0613"
    if not len(row["lang_mentions_sample"]) > 0:
        row["open_ai_resp"] = str({})
        return row
    elif row["acl_id"] in response_cache:
        row["open_ai_resp"] = str(response_cache[row["acl_id"]])
        return row
    else:
        row["open_ai_resp"] = str({})
        return row
    # time.sleep(0.1)
    # input_msgs = [
    #     {
    #         "role": "system",
    #         "content": "You are a Natural Language Processing expert carefully studying papers from ACL. On each line, only return valid Python set.",
    #     },
    #     {
    #         "role": "user",
    #         "content": str(
    #             'What are the primary languages of interest from this set of a paper with these sentences? Ignore languages that are only mentioned in passing, for example mentions like "Unlike English": should not lead to English being included in the set. \n Sentences: '
    #             + "\n".join(row["lang_mentions_sample"])
    #         ),
    #     },
    # ]
    # row["open_ai_resp"] = str(
    #     openai.ChatCompletion.create(
    #         engine=engine, messages=input_msgs, temperature=0, stop="\n"
    #     )
    # )
    # file1 = open("raw_openai_resp_cache", "a")  # append mode
    # file1.write(f'[BEG_WILL]{row["acl_id"]}[MID_WILL]{row["open_ai_resp"]}[END_WILL]')
    # file1.close()
    # return row


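# Keep only the regex-detected languages that ChatGPT also judged to be
# primary languages of the paper.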
def filter_using_openai(row):
    final_langs = []
    for lang in set(row["langs"]):
        # 'Chinese' is linguistically ambiguous, but pervasive.
        # Map it to Mandarin here as that is the most frequent sense.
        if lang == "Mandarin" and "Chinese" in row["open_ai_resp"]:
            final_langs.append(lang)
        elif lang in row["open_ai_resp"]:  # elif avoids appending Mandarin twice
            final_langs.append(lang)
    row["final_langs"] = final_langs
    return row


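# Three-stage pipeline, each stage cached to disk: (1) parse JSON and resolve
# author countries, (2) regex-detect language mentions, (3) sample mention
# sentences and query ChatGPT. After each cache reload, list-valued columns
# are "[SEP]"-joined so every column can pass through Ray as a string, then
# split back into lists by re_list. Ends in an interactive console.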
if __name__ == "__main__":
    import pickle as pkl

    # Stage 1: attach parsed JSON and resolve author countries
    if not os.path.isfile("cache"):
        df = pkl.load(
            open(
                "acl-publication-info.74k.v3.full-sections-partial-topic-labels.pkl",
                "rb",
            )
        )
        rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
        ray.data.DataContext.get_current().execution_options.verbose_progress = True
        rdf = add_json(rdf)
        rdf = add_country(rdf)
        with open("cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)

    # Stage 2: regex-detect language mentions in the full text
    if not os.path.isfile("lang_cache"):
        rdf = rdf.map(check_lang)
        with open("lang_cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("lang_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)

    # Stage 3: query ChatGPT (cache-only, see above) on sampled mentions
    if not os.path.isfile("open_ai_cache"):
        rdf = rdf.map(get_lang_sents).map(
            get_langs_chatgpt, compute=ray.data.ActorPoolStrategy(size=2)
        )
        df = rdf.to_pandas()
        with open("open_ai_cache", "wb") as f:
            pkl.dump(df, f)

    with open("open_ai_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    df["open_ai_resp"] = df.open_ai_resp.apply(ast.literal_eval).apply(
        lambda x: x["choices"][0]["message"]["content"] if "choices" in x else ""
    )
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list).map(filter_using_openai)
    df = rdf.to_pandas()

    # Drop into an interactive console for manual inspection of the results
    code.InteractiveConsole(locals=globals()).interact()