Last active
May 9, 2019 01:24
-
-
Save octoparse/3abc6771a87e49e34c9fa18f2ed7d91e to your computer and use it in GitHub Desktop.
Revisions
-
octoparse renamed this gist
Apr 26, 2019 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
octoparse renamed this gist
Apr 26, 2019 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
octoparse created this gist
Apr 26, 2019 .There are no files selected for viewing
# Tally the gender of the actors in box-office movies, grouped by release
# year, first for every movie in boxoffice.csv and then for the Marvel titles
# listed in marvel_movies.txt.
#
# The code sticks to syntax that runs unchanged on Python 2.7 and Python 3.
import collections
import re
import sys

# One ASCII capital letter.  In the scraped cast string a surname runs straight
# into the next actor's first name ("NimoyChris"), so the position of the
# second capital marks where that first name starts.
_CAPITAL_RE = re.compile(r'[A-Z]')


def get_first_name(aString):
    """Extract actor first names from a run-together cast string.

    '*' characters are removed and the string is split on single spaces.
    Each token is then handled by how many ASCII capitals it contains:

    * one capital   -- the token is kept whole ('Leonard');
    * two capitals  -- the text from the second capital onwards is kept
                       ('NimoyChris' -> 'Chris');
    * anything else -- the token is ignored (so 'McAvoyJennifer' yields
                       nothing).

    The last collected entry is always dropped; for a cast string that ends
    with a surname this is presumably that trailing surname rather than a
    first name.

    Returns the list of first names, or ``aString`` itself when it is empty
    or None.

    >>> get_first_name('Leonard Nimoy*Chris PineZachary Quinto')
    ['Leonard', 'Chris', 'Zachary']
    """
    if not aString:
        return aString

    name_list = []
    for token in aString.replace('*', '').split(' '):
        capitals = [match.start() for match in _CAPITAL_RE.finditer(token)]
        if len(capitals) == 1:
            name_list.append(token)
        elif len(capitals) == 2:
            # Slice at the real position of the second capital; searching for
            # the letter itself can hit the first capital when both letters
            # are the same.
            name_list.append(token[capitals[1]:])
    return name_list[0:-1]


def read_marvel(file_name):
    """Return the movie titles stored one per line in *file_name*.

    Blank lines (typically the empty string left after a trailing newline)
    are dropped so they are never looked up as a movie title.
    """
    with open(file_name) as f:
        return [title for title in f.read().split('\n') if title.strip()]


def read_csv(file_name):
    """Read the box-office file into ``(movie name, first names, year)`` tuples.

    Each row is split on ','; column 0 is the movie name, column 1 the cast
    string (converted with get_first_name) and column 2 the year, which is
    kept as a string.  Rows with fewer than three columns are ignored and any
    columns past the third are discarded.
    """
    movie_list = []
    with open(file_name) as f:
        for row in f.read().split('\n'):
            fields = row.split(',')
            if len(fields) >= 3:
                movie_list.append(
                    (fields[0], get_first_name(fields[1]), fields[2]))
    return movie_list


def lookup_gender(filename):
    """Build a ``{first name: gender}`` dict from 'firstname,gender' rows.

    Blank rows are skipped, so the file may or may not end with a newline.
    Raises ValueError if a non-blank row does not have exactly two
    comma-separated fields.
    """
    gender_dict = {}
    with open(filename) as f:
        for row in f.read().split('\n'):
            if not row.strip():
                continue
            firstname, gender = row.split(',')
            gender_dict[firstname] = gender
    return gender_dict


def count_genders_by_year(movies, gender_dict):
    """Group the genders of all actors by release year.

    *movies* is an iterable of ``(title, actor first names, year)``.  The
    result is an OrderedDict mapping each year, in first-seen order, to the
    list of genders of the actors in that year's movies.

    A movie with an actor missing from *gender_dict* contributes nothing and
    is reported on stderr.  Previously the lookup error was swallowed and the
    preceding movie's genders were added a second time.
    """
    genders_by_year = collections.OrderedDict()
    for title, actors, year in movies:
        try:
            genders = [gender_dict[name] for name in actors]
        except KeyError as missing:
            sys.stderr.write('skipping %r: no gender on record for %r\n'
                             % (title, missing.args[0]))
            continue
        genders_by_year.setdefault(year, []).extend(genders)
    return genders_by_year


def print_gender_counts(genders_by_year):
    """Print one 'year {gender: count}' line per year."""
    for year, genders in genders_by_year.items():
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (year, dict(collections.Counter(genders))))


def main():
    """Run both analyses: all box-office movies, then the Marvel subset."""
    # construct a first name to gender dictionary
    gender_dict = lookup_gender('name.csv')

    # movie_list has 3 columns: name, actor list, year
    movie_list = read_csv('boxoffice.csv')
    sorted_movie_list = sorted(
        movie_list, key=lambda movie: movie[2], reverse=True)

    # The list of names to be labelled with a gender can be regenerated with:
    ##all_actors = set()
    ##for m in sorted_movie_list:
    ##    for n in m[1]:
    ##        all_actors.add(n)
    ##
    ##with open('raw_name.csv','w') as f:
    ##    for name in all_actors:
    ##        f.write(name + '\n')

    # movie name -> [actor first names, year]; a repeated name keeps the data
    # of the last row seen in sorted order.
    all_movie_dict = collections.OrderedDict()
    for title, actors, year in sorted_movie_list:
        all_movie_dict[title] = [actors, year]

    ## Analyze all movies' actor gender by year
    print(' key: year, value: list of gender of male/female')
    print_gender_counts(count_genders_by_year(sorted_movie_list, gender_dict))

    ## Analyze Marvel's movie actor gender by year
    marvel_movies = []
    for title in read_marvel('marvel_movies.txt'):
        if title not in all_movie_dict:
            sys.stderr.write('skipping %r: not in the box-office data\n'
                             % (title,))
            continue
        actors, year = all_movie_dict[title]
        marvel_movies.append((title, actors, str(year)))

    print('key: male/female, value: count')
    print_gender_counts(count_genders_by_year(marvel_movies, gender_dict))


if __name__ == '__main__':
    main()