Skip to content

Instantly share code, notes, and snippets.

@octoparse
Last active May 9, 2019 01:24
Show Gist options
  • Select an option

  • Save octoparse/3abc6771a87e49e34c9fa18f2ed7d91e to your computer and use it in GitHub Desktop.

Select an option

Save octoparse/3abc6771a87e49e34c9fa18f2ed7d91e to your computer and use it in GitHub Desktop.

Revisions

  1. octoparse renamed this gist Apr 26, 2019. 1 changed file with 0 additions and 0 deletions.
  2. octoparse renamed this gist Apr 26, 2019. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. octoparse created this gist Apr 26, 2019.
    103 changes: 103 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,103 @@
    import collections
    import re


    def get_first_name(aString):
        """Extract actor first names from a run-together cast string.

        The cast string glues consecutive actor names together with no
        separator (optionally marked with '*'), e.g.
        'Leonard Nimoy*Chris PineZachary QuintoZoe SaldanaKarl Urban'.
        After removing '*' and splitting on spaces:

        * a token with exactly one capital letter is kept whole;
        * a token with exactly two capitals ('NimoyChris') contributes the
          part starting at its second capital ('Chris');
        * tokens with zero or more than two capitals are ignored.

        The last collected entry is then dropped (presumably it is the final
        actor's surname, e.g. 'Urban' above).

        Args:
            aString: raw cast string; may be empty or None.

        Returns:
            A list of extracted first names. Empty or None input yields an
            empty list so callers can always iterate/concatenate the result.
        """
        if not aString:
            return []

        name_list = []
        for token in aString.replace('*', '').split(' '):
            # Positions of every capital letter in the token.
            capitals = [m.start() for m in re.finditer(r'[A-Z]', token)]
            if len(capitals) == 1:
                name_list.append(token)
            elif len(capitals) == 2:
                # Slice at the second capital's actual position rather than
                # searching for its character, which could match the first
                # capital when both are the same letter.
                name_list.append(token[capitals[1]:])
        return name_list[:-1]

    def read_marvel(file_name):
        """Read movie titles from a text file, one title per line.

        Args:
            file_name: path of the text file to read.

        Returns:
            A list of the non-empty lines of the file, in file order, with
            line terminators removed.
        """
        with open(file_name) as f:
            titles = [line.rstrip('\r\n') for line in f]
        # A trailing newline or blank line is not a movie title; dropping it
        # keeps callers from looking up '' as a movie name.
        return [title for title in titles if title]

    def read_csv(file_name):  # file_name = 'boxoffice.csv'
        """Load movie rows from a comma-separated text file.

        Each line is split on ','. Lines with fewer than three fields are
        ignored; for the rest, the second field is passed through
        get_first_name().

        Args:
            file_name: path of the file to read.

        Returns:
            A list of (movie name, actor first names, year) tuples, in file
            order. The year is kept as the raw string from the file.
        """
        with open(file_name) as f:
            rows = f.read().split('\n')

        movie_list = []
        for row in rows:
            fields = row.split(',')
            if len(fields) < 3:
                continue
            # 0: movie name, 1: actor names, 2: year
            movie_list.append((fields[0], get_first_name(fields[1]), fields[2]))
        return movie_list

    def lookup_gender(filename):
        """Build a first-name -> gender mapping from a two-column text file.

        Every non-blank line must have the form 'firstname,gender'. Blank
        lines (including the one produced by a trailing newline) are skipped,
        so the last record is kept whether or not the file ends with a
        newline.

        Args:
            filename: path of the lookup file.

        Returns:
            A dict mapping each first name to its gender string. If a name
            occurs more than once, the last occurrence wins.

        Raises:
            ValueError: if a non-blank line does not split into exactly two
                comma-separated fields.
        """
        gender_dict = {}
        with open(filename) as f:
            for line in f:
                line = line.rstrip('\r\n')
                if not line:
                    continue
                firstname, gender = line.split(',')
                gender_dict[firstname] = gender
        return gender_dict

    ## SCRIPT begins
    # first name -> gender lookup table, loaded from name.csv
    gender_dict = lookup_gender('name.csv')

    # filled in later: movie name -> [actor list, year]
    all_movie_dict = collections.OrderedDict()

    # each row of movie_list is (name, actor list, year)
    movie_list = read_csv('boxoffice.csv')

    # same rows ordered by the year column, largest first
    sorted_movie_list = list(movie_list)
    sorted_movie_list.sort(key=lambda row: row[2], reverse=True)

    ##all_actors = set()
    ##for m in sorted_movie_list:
    ## for n in m[1]:
    ## all_actors.add(n)
    ##
    ##with open('raw_name.csv','w') as f:
    ## for name in all_actors:
    ## f.write(name + '\n')


    ## Analyze all movies' actor gender by year
    # all_year_dict: year -> list of genders of the actors in that year's movies.
    # all_movie_dict is filled here as well: movie name -> [actor list, year].
    all_year_dict = collections.OrderedDict()
    for movie_name, actors, year in sorted_movie_list:
        all_movie_dict[movie_name] = [actors, year]
        # Names missing from gender_dict are skipped one by one. Wrapping the
        # whole lookup in try/except KeyError would leave `genders` bound to
        # the previous movie's list (or unbound on the first movie), so stale
        # data would be counted for this year.
        genders = [gender_dict[name] for name in actors if name in gender_dict]
        all_year_dict.setdefault(year, []).extend(genders)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(' key: year, value: list of gender of male/female')
    for i in all_year_dict:
        # gender -> number of occurrences for this year
        my_temp_dict = dict(collections.Counter(all_year_dict[i]))
        print('%s %s' % (i, my_temp_dict))


    ## Analyze Marvel's movie actor gender by year
    # construct a dict, key: year, value: list of genders of that year's
    # Marvel movie actors
    year_dict = collections.OrderedDict()
    for marvel_movie in read_marvel('marvel_movies.txt'):
        if not marvel_movie:
            # A blank line in the file (e.g. from a trailing newline) is not
            # a movie title; looking it up would raise KeyError.
            continue
        # all_movie_dict values are [actor list, year]; an unknown title still
        # raises KeyError so a mismatch between the two files is noticed.
        actors, movie_year = all_movie_dict[marvel_movie]
        year = str(movie_year)
        # Names missing from gender_dict are skipped rather than aborting the run.
        gender = [gender_dict[name] for name in actors if name in gender_dict]
        year_dict.setdefault(year, []).extend(gender)

    # construct a dict, key: male/female, value: count
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('key: male/female, value: count')
    for i in year_dict:
        my_dict = dict(collections.Counter(year_dict[i]))
        print('%s %s' % (i, my_dict))