Created
July 22, 2018 06:41
-
-
Save lopezdp/22f1089f5632a968b58c5564f3c763f7 to your computer and use it in GitHub Desktop.
Daily Resolution for Long Term (> 5 Years) Google Trends Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Daily-resolution Google Trends data for a long (> 5 years) date span.
#
# The span is cut into consecutive 60-day windows that are requested one at a
# time. Neighbouring windows share their boundary day, and the ratio of the
# two readings on that day is used to rescale each new window onto the scale
# of the data collected so far before it is appended.
#
# NOTE(review): this snippet uses `TrendReq`, `dt`, `math` and `pd` without
# importing them; presumably `from pytrends.request import TrendReq`,
# `import datetime as dt`, `import math` and `import pandas as pd` were run
# beforehand -- confirm.

#### connect to google
_pytrends = TrendReq(hl='en-US', tz=360)

#### build the payload
_kw_list = ["bitcoin"]
_cat = 0
_geo = ''
_gprop = ''

# dates can be formatted as `2017-12-07 2018-01-07`, `today 3-m`, or
# `today 5-y`; check trends.google.com's url
_date_fmt = '%Y-%m-%d'

# Parse the first and last day of the span into datetime objects.
_start_date, _end_date = (
    dt.datetime.strptime(_day, _date_fmt)
    for _day in ('2010-08-29', '2018-07-22')
)

### Build the list of 60d windows used to retrieve one-day-resolution data

# _60d_periods is the number of 60-day windows needed to cover the span,
# rounded up so that the final partial window is included.
_60d_periods = math.ceil((_end_date - _start_date) / dt.timedelta(days=60))

# _tmp_range holds the window boundaries, 60 days apart; N windows need N + 1
# boundaries. The last boundary may fall after _end_date.
# (original author's note: if the end date is in the future google returns
# the most recent data)
_tmp_range = pd.date_range(start=_start_date, periods=_60d_periods + 1, freq='60D')

# One 'YYYY-MM-DD YYYY-MM-DD' timeframe string per pair of adjacent
# boundaries, so each window starts on the day the previous one ends.
_rolling_dates = [
    '{} {}'.format(_window_start.strftime(_date_fmt), _window_end.strftime(_date_fmt))
    for _window_start, _window_end in zip(_tmp_range[:-1], _tmp_range[1:])
]

# Initialise the main data frame _df_trends with the first window.
# _date is the timeframe argument of that first payload.
_date = _rolling_dates[0]
_pytrends.build_payload(_kw_list, cat=_cat, timeframe=_date, geo=_geo, gprop=_gprop)
# 'isPartial' is a flag column, not a measurement: drop it up front so it never
# takes part in the scaling arithmetic below and is also absent when the span
# fits in a single window. errors='ignore' tolerates a frame without it.
_df_trends = _pytrends.interest_over_time().drop(labels='isPartial', axis=1, errors='ignore')

for _dates in _rolling_dates[1:]:
    # The first day of this window is the last day of the previous one.
    _common_date = _dates.split(' ')[0]
    _pytrends.build_payload(_kw_list, cat=_cat, timeframe=_dates, geo=_geo, gprop=_gprop)
    _tmp_df = _pytrends.interest_over_time().drop(labels='isPartial', axis=1, errors='ignore')

    # Normalise before concatenation: per-keyword ratio between the value
    # already stored for the shared day and the new window's value for it.
    # NOTE(review): a reading of 0 on the shared day makes this factor inf/NaN
    # (new window is 0) or 0 (stored value is 0), which then carries into every
    # later window -- left unchanged here; decide how it should be handled.
    _multiplication_factor = _df_trends.loc[_common_date] / _tmp_df.loc[_common_date]

    # _df_trends contains the normalised Trends data. The new window's first
    # row (the shared day) is skipped so that day is not duplicated.
    _df_trends = (
        pd.concat([_df_trends, _tmp_df[1:] * _multiplication_factor])
        .resample('D', closed='right').bfill()  # making sure that we have one value per day.
    )

gglTrnd = _df_trends
gglTrnd.mean()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment