@Helw150
Last active May 6, 2024 17:33

Revisions

1. Helw150 revised this gist May 6, 2024. 1 changed file with 0 additions and 3 deletions (process_parses.py):

   @@ -326,9 +326,6 @@ def load_open_ai_cache():

        def get_langs_chatgpt(row):
            global response_cache
   -        # openai.api_base = "https://diyi-group-nsf.openai.azure.com/"
   -        # openai.api_type = "azure"
   -        # openai.api_version = "2023-05-15"
            engine = "chatgpt0613"
            if not len(row["lang_mentions_sample"]) > 0:
                row["open_ai_resp"] = str({})
2. Helw150 created this gist May 6, 2024. 1 changed file with 435 additions and 0 deletions (process_parses.py). The file as created:

import ast

# To Delete After Debug
import code
import copyreg
import datetime
import functools
import json
import os
import re
import time
from ast import literal_eval
from collections import defaultdict
from urllib.parse import urlsplit

import gnureadline
import numpy as np
import openai
import pandas as pd
import ray
import tiktoken
from countryguess import guess_country
from fuzzysearch import find_near_matches
from nltk.tokenize import sent_tokenize
from tqdm import tqdm


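# domains.csv holds "email-suffix,location" pairs (parsed line by line below);
# get_locations uses it to resolve an author's country directly from the
# suffix of their email address.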
def load_mapping():
    with open("domains.csv", "r") as f:
        lines = f.readlines()
    return {line.split(",")[0].strip(): line.split(",")[1].strip() for line in lines}


enc = tiktoken.encoding_for_model("gpt-3.5-turbo")


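# tiktoken Encoding objects aren't picklable out of the box; this reducer is
# registered with copyreg below so the encoder can be shipped to Ray workers.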
def pickle_Encoding(enc):
    return (
        functools.partial(
            tiktoken.core.Encoding,
            enc.name,
            pat_str=enc._pat_str,
            mergeable_ranks=enc._mergeable_ranks,
            special_tokens=enc._special_tokens,
        ),
        (),
    )


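# Register the pickle reducer, then build an alternation pattern over every
# country name in list-of-countries.txt ("," is included so the pattern can
# also split on comma delimiters); compiled below as country_pattern.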
copyreg.pickle(tiktoken.core.Encoding, pickle_Encoding)
countries = (
    "("
    + "|".join(
        [
            re.escape(country)
            for country in open("list-of-countries.txt", "r").read().split("\n")
        ]
        + [","]
    )
    + ")"
)

# Manually add Chinese since it's not in the taxonomy
langs = (
    r"\b("
    + "|".join([re.escape(lang) for lang in pd.read_csv("lang2tax.txt").language.array])
    + "|Chinese"
    + r")\b"
)
lang_pattern = re.compile(langs, flags=re.IGNORECASE)

# world-universities.csv maps university names to country codes and websites;
# index by name and keep only the bare domain of each website for matching
# against email addresses.
uni_db = pd.read_csv("world-universities.csv").astype(str)
uni_db["uni_website"] = uni_db["uni_website"].map(
    lambda x: urlsplit(x).netloc.replace("www.", "")
)
uni_db = uni_db.set_index(["uni_name"]).sort_index()

# GeoNames postal-code database, indexed by postal code, for resolving
# affiliation addresses to countries.
zcdb = pd.read_csv(
    "allCountries.txt",
    sep="\t",
    dtype=str,
    names=[
        "country_code",
        "postal_code",
        "place_name",
        "admin1_name",
        "admin1_code",
        "admin2_name",
        "admin2_code",
        "admin3_name",
        "admin3_code",
        "latitude",
        "longitude",
        "accuracy",
    ],
).set_index(["postal_code"])

# Strip French "CEDEX" suffixes so those postal codes match GeoNames entries
cedex_pattern = r"CEDEX( [0-9])?"
zcdb.index = zcdb.index.map(str).map(lambda x: re.sub(cedex_pattern, "", x))
zcdb = zcdb.astype(str).sort_index()

country_pattern = re.compile(countries, flags=re.IGNORECASE)

# The first pattern covers the most common postal-code formats; the second
# captures all global formats in the GeoNames DB.
zip_patterns = [
    r"[0-9]{4,5}",
    r"(?i)([a-z0-9][a-z0-9\- ]{0,10}[a-z0-9])",
]
email_mapping = load_mapping()


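# ACL Anthology IDs come in two flavors: old-style IDs such as "P19-1001"
# start with a letter, while new-style IDs such as "2020.acl-main.173" start
# with a year, which is stripped here. The derived prefixes determine where
# the parsed JSON for a paper lives on disk.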
def get_prefixes(acl_id):
    prefix = acl_id[0:] if acl_id[0].isalpha() else acl_id[5:]
    first_prefix = prefix[0].upper()
    second_prefix = prefix.split("-")[0]
    return first_prefix, second_prefix


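# Attach the paper's parsed full-text JSON to the row, falling back to an
# alternate filename convention derived from the paper URL when the first
# path is missing. Rows that fail both lookups get None and are dropped later.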
def get_json(row):
    acl_id = row["acl_id"]
    first_prefix, second_prefix = get_prefixes(acl_id)
    try:
        row["json"] = open(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix}/{acl_id}.json", "r"
        ).read()
    except:
        prior = row["url"].split("/")[-3][-2:]
        latter = row["url"].split("/")[-1].replace(".pdf", "")
        print(
            f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json"
        )
        try:
            row["json"] = open(
                f"./Base_JSON/prefix{first_prefix}/json/{second_prefix.lower()}/l{prior}_{latter}.json",
                "r",
            ).read()
        except:
            row["json"] = None
    return row


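# Resolve an author record to a list of countries, trying signals roughly in
# decreasing order of reliability: known email suffix -> explicit country
# field -> postal code (GeoNames) -> university email domain -> fuzzy match
# of the institution name against the university database.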
def get_locations(author):
    affiliation = author["affiliation"]
    # Signal 1: known email suffix
    if "email" in author and author["email"].split(".")[-1] in email_mapping:
        return [email_mapping[author["email"].split(".")[-1]]]
    # Signal 2: explicit country field in the affiliation location
    if "location" in affiliation and "country" in affiliation["location"]:
        countries = []
        for country in re.split(
            country_pattern,
            affiliation["location"]["country"].replace(" and ", ","),
        ):
            guess = guess_country(country)
            if guess:
                countries.append(guess["name_short"])
        if len(countries) > 0:
            return countries
    # Signal 3: postal code resolved through GeoNames
    if "location" in affiliation and len(affiliation["location"]) > 0:
        location = defaultdict(str, affiliation["location"])
        full_addr = f"{location['addrLine']} {location['settlement']} {location['region']} {location['postCode']}"
        if "postCode" in location:
            code_matches = []
            zip_codes = [
                match
                for pattern in zip_patterns
                for match in re.findall(pattern, full_addr)
            ]
            zip_codes = [code for code in zip_codes if code in zcdb.index]
            zipc = zcdb.loc[zip_codes]
            # Only trust a postal-code hit if its place or admin-region name
            # also appears in the address
            for code in zipc.iloc:
                if (
                    code.place_name in full_addr
                    or (code.admin1_code in full_addr and code.admin1_code.isalpha())
                    or code.admin1_name in full_addr
                ):
                    code_matches.append(guess_country(code.country_code)["name_short"])
            if len(code_matches) > 0:
                return code_matches

    # Signal 4: university matched by email domain
    if "email" in author and author["email"]:
        author_website = author["email"].split("@")[-1]
        if author_website in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_website].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]
        # Retry with the bare top-level domain (e.g. cs.example.edu -> example.edu)
        author_tld_only = (
            ".".join(author_website.split(".")[-2:])
            if len(author_website.split(".")) > 2
            else None
        )

        if author_tld_only and author_tld_only in uni_db["uni_website"].array:
            return [
                guess_country(
                    uni_db[uni_db["uni_website"] == author_tld_only].iloc[0][
                        "country_code"
                    ]
                )["name_short"]
            ]

        tld_only = uni_db.uni_website.map(lambda x: ".".join(x.split(".")[-2:]))
        if author_website in tld_only.array:
            return [
                guess_country(
                    uni_db[tld_only == author_website].iloc[0]["country_code"]
                )["name_short"]
            ]

    # Signal 5: fuzzy match of the institution/laboratory name against the
    # university database (either key may be missing, hence .get)
    if "institution" in affiliation or "laboratory" in affiliation:
        lab_and_school = f"{affiliation.get('laboratory', '')} {affiliation.get('institution', '')}"
        school_matches = []
        for uni in uni_db.index.array:
            cand_match = find_near_matches(
                uni, lab_and_school, max_l_dist=1, max_substitutions=0, max_deletions=0
            )
            if len(cand_match) > 0:
                match = cand_match[0].matched
                school_matches.append((match, uni))
        # Keep only the most specific candidates: drop any match whose matched
        # text is a substring of a different candidate's match
        c = school_matches
        school_matches = [
            match[1]
            for match in school_matches
            if all(
                [
                    (match[0] not in c_match[0] or match[1] == c_match[1])
                    for c_match in school_matches
                ]
            )
        ]

        if len(school_matches) > 0:
            countries = []
            for match in school_matches:
                country_info = uni_db.loc[match].country_code
                if isinstance(country_info, str) and country_info != "nan":
                    countries.append(guess_country(country_info)["name_short"])
            if len(countries) > 0:
                return countries

    return []


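# Collect the deduplicated set of countries across all of a paper's authors;
# non-string entries (failed lookups) are filtered out.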
def get_institutions(row):
    row["countries"] = set(
        filter(
            lambda x: str(x) == x,
            [
                location
                for author in json.loads(row["json"])["authors"]
                for location in get_locations(author)
            ],
        )
    )
    return row


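# Pipeline stage: attach parsed JSON to every row and drop rows with no parse.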
def add_json(rdf):
    return rdf.map(get_json).filter(lambda row: row["json"] is not None)


def dedupe_country(row):
    countries = row["countries"]
    normalized_countries = []
    for country in countries:
        # "Europe" is not a country, so guess_country cannot normalize it
        if country == "Europe":
            normalized_countries.append(country)
        else:
            normalized = guess_country(country)
            normalized_countries.append(normalized["name_short"])
    row["countries"] = set(normalized_countries)
    return row


def add_country(rdf):
    return (
        rdf.map(get_institutions)
        .filter(lambda row: len(row["countries"]) > 0)
        .map(dedupe_country)
    )


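# Pipeline stage: find every taxonomy language mentioned in the paper's full
# text (case-insensitive) and normalize the casing of each match.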
def check_lang(row):
    row["langs"] = [
        match.lower().capitalize()
        for match in re.findall(lang_pattern, str(row["full_text"]))
    ]
    return row


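# Undo the "[SEP]"-joined serialization used when list-valued columns are
# round-tripped through the pandas caches in the main block below.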
def re_list(row):
    if "countries" in row:
        row["countries"] = row["countries"].split("[SEP]")
    if "langs" in row:
        row["langs"] = row["langs"].split("[SEP]")
    return row


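# Sample up to 5 mention sentences per paper (all of them if there are fewer)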
sample_5 = functools.partial(np.random.choice, size=5, replace=False)
sample = lambda x: sample_5(x) if len(x) >= 5 else x


def get_lang_sents(row):
    row["langs"] = [lang for lang in row["langs"] if len(lang) > 0]
    # Keep sentences under 1000 chars that mention any detected language
    sents = [
        sent
        for sent in sent_tokenize(row["full_text"])
        if any([lang in sent for lang in row["langs"]])
        if len(sent) < 1000
    ]
    row["lang_mentions"] = sents
    row["lang_mentions_sample"] = list(sample(sents))
    # Token count of the sample, for tracking the prompt budget
    row["tok_len"] = len(enc.encode(str(row["lang_mentions_sample"])))
    return row


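# The OpenAI response cache is an append-only text file; each record is
#   [BEG_WILL]<acl_id>[MID_WILL]<response JSON>[END_WILL]
# (see the commented-out write in get_langs_chatgpt below).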
def load_open_ai_cache():
    with open("raw_openai_resp_cache", "r") as f:
        raw = f.read()
    entries = [
        line.replace("[BEG_WILL]", "").split("[MID_WILL]")
        for line in raw.split("[END_WILL]")
    ]
    c = {entry[0]: json.loads(entry[1]) for entry in entries if len(entry) == 2}
    return c


response_cache = load_open_ai_cache()


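# Look up the ChatGPT language-extraction response for a row. The live Azure
# OpenAI call below is commented out, so this now only serves cached
# responses; rows with no mention sample or no cache hit get an empty dict.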
def get_langs_chatgpt(row):
    global response_cache
    # openai.api_base = "https://diyi-group-nsf.openai.azure.com/"
    # openai.api_type = "azure"
    # openai.api_version = "2023-05-15"
    engine = "chatgpt0613"
    if not len(row["lang_mentions_sample"]) > 0:
        row["open_ai_resp"] = str({})
        return row
    elif row["acl_id"] in response_cache:
        row["open_ai_resp"] = str(response_cache[row["acl_id"]])
        return row
    else:
        row["open_ai_resp"] = str({})
        return row
    # time.sleep(0.1)
    # input_msgs = [
    #     {
    #         "role": "system",
    #         "content": "You are a Natural Language Processing expert carefully studying papers from ACL. On each line, only return valid Python set.",
    #     },
    #     {
    #         "role": "user",
    #         "content": str(
    #             'What are the primary languages of interest from this set of a paper with these sentences? Ignore languages that are only mentioned in passing, for example mentions like "Unlike English": should not lead to English being included in the set. \n Sentences: '
    #             + "\n".join(row["lang_mentions_sample"])
    #         ),
    #     },
    # ]
    # row["open_ai_resp"] = str(
    #     openai.ChatCompletion.create(
    #         engine=engine, messages=input_msgs, temperature=0, stop="\n"
    #     )
    # )
    # file1 = open("raw_openai_resp_cache", "a")  # append mode
    # file1.write(f'[BEG_WILL]{row["acl_id"]}[MID_WILL]{row["open_ai_resp"]}[END_WILL]')
    # file1.close()
    # return row


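# Keep only the regex-detected languages that ChatGPT also judged to be
# primary languages of the paper.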
def filter_using_openai(row):
    final_langs = []
    for lang in set(row["langs"]):
        # 'Chinese' is linguistically ambiguous, but pervasive.
        # Map it to Mandarin here as that is the most frequent sense.
        if lang == "Mandarin" and "Chinese" in row["open_ai_resp"]:
            final_langs.append(lang)
        elif lang in row["open_ai_resp"]:  # elif avoids appending Mandarin twice
            final_langs.append(lang)
    row["final_langs"] = final_langs
    return row


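# Three-stage pipeline, each stage cached to disk: (1) parse JSON and resolve
# author countries, (2) regex-detect language mentions, (3) sample mention
# sentences and query ChatGPT. After each cache reload, list-valued columns
# are "[SEP]"-joined so every column can pass through Ray as a string, then
# split back into lists by re_list. Ends in an interactive console.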
if __name__ == "__main__":
    import pickle as pkl

    # Stage 1: attach parsed JSON and resolve author countries
    if not os.path.isfile("cache"):
        df = pkl.load(
            open(
                "acl-publication-info.74k.v3.full-sections-partial-topic-labels.pkl",
                "rb",
            )
        )
        rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
        ray.data.DataContext.get_current().execution_options.verbose_progress = True
        rdf = add_json(rdf)
        rdf = add_country(rdf)
        with open("cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)

    # Stage 2: regex-detect language mentions in the full text
    if not os.path.isfile("lang_cache"):
        rdf = rdf.map(check_lang)
        with open("lang_cache", "wb") as f:
            pkl.dump(rdf.to_pandas(), f)
    with open("lang_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list)

    # Stage 3: query ChatGPT (cache-only, see above) on sampled mentions
    if not os.path.isfile("open_ai_cache"):
        rdf = rdf.map(get_lang_sents).map(
            get_langs_chatgpt, compute=ray.data.ActorPoolStrategy(size=2)
        )
        df = rdf.to_pandas()
        with open("open_ai_cache", "wb") as f:
            pkl.dump(df, f)

    with open("open_ai_cache", "rb") as f:
        df = pkl.load(f)
    df["countries"] = df["countries"].apply("[SEP]".join)
    df["langs"] = df["langs"].apply("[SEP]".join)
    df = df.astype(str)
    df["open_ai_resp"] = df.open_ai_resp.apply(ast.literal_eval).apply(
        lambda x: x["choices"][0]["message"]["content"] if "choices" in x else ""
    )
    rdf = ray.data.from_pandas(df).repartition(num_blocks=32)
    rdf = rdf.map(re_list).map(filter_using_openai)
    df = rdf.to_pandas()

    # Drop into an interactive console for manual inspection of the results
    code.InteractiveConsole(locals=globals()).interact()