Created
December 10, 2015 19:55
-
-
Save pjha1994/05a852430427738f44c0 to your computer and use it in GitHub Desktop.
Revisions
-
pjha1994 created this gist
Dec 10, 2015. There are no files selected for viewing.
"""Reddit listing scraper.

Fetches subreddit listing pages ('hot', 'new', 'top', ...), extracts post
title, link, tagline/time and score information from each listing <div>,
and appends the records to per-subreddit text files.  A persistent run
counter is kept in 'mytext.txt'.
"""
import re
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import os
import httplib2
from datetime import datetime

# Running counter of records ("PROMPT n") written across all pages/files.
c = 0


def make_soup(s):
    """Fetch URL *s* and return a BeautifulSoup restricted to <div> tags,
    or None when *s* does not look like a URL."""
    # Dots are escaped: the original pattern's bare '.' matched any character.
    match = re.compile(r'https://|http://|www\.|\.com|\.in|\.org|gov\.in')
    if re.search(match, s):
        http = httplib2.Http()
        # httplib2 returns (response-headers, body-content).
        headers, content = http.request(s)
        # SoupStrainer keeps only <div> subtrees, shrinking the parse tree.
        return BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('div'))
    return None


def test_internet():
    """Block until an HTTP request to Google succeeds — crude connectivity probe."""
    while True:
        try:
            make_soup("https://www.google.com")
            break
        except Exception:
            # Deliberate best-effort retry loop until the network is back.
            continue


def parse1(s):
    """Parse the listing page at URL *s* and append one 'PROMPT' record per
    post to the globally opened output file *f*.

    Increments the global counter *c* once per record written.  Any post
    whose markup does not match the expected shape is skipped.
    """
    global c
    soup = make_soup(s)
    if soup is None:
        return
    for div in soup.find_all('div', class_=["thing", "id-t3_3ua12m", "linkflair",
                                            "linkflair-normal", "odd", "link"]):
        try:
            if (div.p is None or div.p.next_sibling is None
                    or div.p.next_sibling.next_sibling is None):
                continue
            # NOTE(review): only two siblings are None-checked but three are
            # traversed; a missing third sibling raises and is caught below.
            node = div.p.next_sibling.next_sibling.next_sibling
            x = node['class']
            if x[0] == 'entry':
                element = '\nPROMPT ' + str(c + 1) + '\n'
                if node.p is not None and node.p.a is not None:
                    element = element + node.p.a.string + '\n'
                    element = element + node.p.a['href'] + '\n'
                tagline = node.find('p', {'class': 'tagline'})
                if tagline is not None and tagline.time is not None:
                    element = element + tagline.time['datetime'] + '\t'
                    element = element + tagline.time['title'] + '\t'
                    element = element + tagline.time.string + '\n'
                if tagline is not None and tagline.a is not None:
                    element = element + tagline.a.string + '\n'
                    element = element + tagline.text + '\n'
                if div.div.find('div', {'class': 'score likes'}) is not None:
                    element = element + 'score likes ' + div.div.find('div', {'class': 'score likes'}).string + '\t'
                    element = element + 'score dislikes ' + div.div.find('div', {'class': 'score dislikes'}).string + '\t'
                    element = element + 'score unvoted ' + div.div.find('div', {'class': 'score unvoted'}).string + '\n\n'
                f.write(element)
                c = c + 1
            elif x[0] == 'thumbnail':
                element = '\nPROMPT ' + str(c + 1) + '\n'
                entry = div.find('div', {'class': 'entry unvoted'})
                if (entry is not None and entry.p is not None
                        and entry.p.a is not None and entry.p.a.string is not None):
                    element = element + entry.p.a.string + '\n'
                    element = element + entry.p.a['href'] + '\n'
                # If entry is None this raises AttributeError, caught below —
                # same outcome as the original chained lookups.
                tagline = entry.find('p', {'class': 'tagline'})
                if tagline is not None and tagline.time is not None:
                    element = element + tagline.time['datetime'] + '\t'
                    element = element + tagline.time['title'] + '\t'
                    element = element + tagline.time.string + '\n'
                # NOTE(review): nesting of this check was ambiguous in the
                # mangled source; made top-level to mirror the 'entry' branch.
                if tagline.a is not None:
                    element = element + tagline.a.string + '\n'
                    element = element + tagline.text + '\n'
                scores = div.p.next_sibling.next_sibling
                if (scores.find('div', {'class': 'score likes'}) is not None
                        and scores.find('div', {'class': 'score dislikes'}) is not None
                        and scores.find('div', {'class': 'score unvoted'}) is not None):
                    element = element + 'score likes ' + scores.find('div', {'class': 'score likes'}).string + '\t\t'
                    element = element + 'score dislikes ' + scores.find('div', {'class': 'score dislikes'}).string + '\t\t'
                    element = element + 'score unvoted ' + scores.find('div', {'class': 'score unvoted'}).string + '\n'
                f.write(element)
                c = c + 1
        except Exception:
            # Best-effort scraping: report the malformed post and move on.
            print('ERROR')
            continue


def count_next_of_current(s, m):
    """Return the href of the first rel="next" pagination link on page *s*,
    or None when there is no further page.

    *m* (the subreddit name) is kept for interface compatibility; the
    original only used it to build a regex that was never applied.
    """
    test_internet()
    soup = make_soup(s)
    for link in soup.find_all('a', {'rel': ['next']}):
        return link['href']
    return None


def read_reddit_images(change_file_number, m, x):
    """Scrape the first page plus up to 7 'next' pages of subreddit *m*'s
    tab *x* ('hot', 'new', ...) and append the records to '<m>_<x>.txt'.

    *change_file_number* only labels the iteration in the output file.
    Sets the globals *f* (output file, used by parse1) and *select_tab*.
    """
    test_internet()
    global f
    global select_tab
    select_tab = x
    out_name = m + '_' + select_tab + '.txt'
    f = open(out_name, 'a', encoding='utf-8')
    FORMAT = '%d-%m-%Y %H:%M:%S'
    try:
        f.write('\n\n\n\niteration number ' + str(change_file_number) + ' '
                + datetime.now().strftime(FORMAT) + '\n\n')
        maximum_number_of_next_pages = 7
        s = 'https://www.reddit.com/r/' + m + '/' + select_tab
        # (A redundant extra make_soup(s) fetch was removed — parse1 fetches.)
        parse1(s)
        count = 0
        print('for ' + m + ' ' + select_tab + ' current page number is' + '\n' + str(count))
        while count < maximum_number_of_next_pages:
            test_internet()
            s = count_next_of_current(s, m)
            if s is None:
                break
            parse1(s)
            count = count + 1
            print(count)
        f.write('\n\niteration number ' + str(change_file_number) + ' '
                + datetime.now().strftime(FORMAT) + '\n\n')
    finally:
        # Close even when a scrape raises; the original leaked the handle.
        f.close()


def maincall(m, i):
    """Scrape every listing tab of subreddit *m*; *i* labels the run."""
    for tab in ('hot', 'new', 'top', 'rising', 'controversial', 'gilded'):
        read_reddit_images(i, m, tab)


def subs(b):
    """Bump the persistent run counter in 'mytext.txt', then scrape every
    subreddit named in *b*.

    The pre-increment counter value *i* (a string, exactly as read from the
    file — preserved from the original) labels this run in the output files.
    """
    test_internet()
    with open('mytext.txt', 'r') as t:
        i = t.read()
    with open('mytext.txt', 'w') as t:
        t.write(str(int(i) + 1))
    for k in b:
        test_internet()
        maincall(k, i)


def main():
    """Entry point: pick the subreddit list for this run and scrape it."""
    test_internet()
    # Full "major" list (intended for a weekly run).  Kept for reference;
    # the original assigned it to b and then immediately shadowed it with
    # the short list below, so it is NOT scraped here.
    major_list = ['24hoursupport', '3amjokes', 'ADHD', 'AMA', 'AcademicPhilosophy', 'AcademicPsychology', 'Aerospace', 'Android', 'AndroidQuestions', 'Anger', 'Anxiety',
                  'AskAnthropology', 'AskComputerScience', 'AskElectronics', 'AskEngineers', 'AskHR', 'AskHistorians', 'AskMen', 'AskPhysics', 'AskReddit', 'AskScienceDiscussion',
                  'AskScienceFiction', 'AskSocialScience', 'AskWomen', 'Ask_Politics', 'Bash', 'BehavioralEconomics', 'BigDataJobs', 'BipolarReddit', 'CAD', 'C_Programming',
                  'ComputerScience', 'Confession', 'CoverTheWorld', 'Cplusplus', 'CppForbeginners', 'CrappyDesign', 'CrazyIdeas', 'DIY', 'DIYCompSci', 'DailyProgrammer', 'DeadBedrooms',
                  'DebateReligion', 'DecidingToBeBetter', 'DigitalNomad', 'DoesNotTranslate', 'ECE', 'Economics', 'EngineeringStudents', 'Entrepreneur', 'ExNoContact', 'FEA', 'FE_Exam',
                  'Feminism', 'FluidMechanics', 'Foodforthought', 'FoundWords', 'Freethought', 'GetMotivated', 'GetStudying', 'GraphicsProgramming', 'HITsWorthTurkingFor', 'HTMLBattles',
                  'HomeworkHelp', 'HowsYourJob', 'IAmA', 'IOPsychology', 'InternetIsBeautiful', 'LaTeX', 'LanguageLearning', 'LearnANewLanguage', 'LearnJava', 'LearnJavaScript',
                  'LifeProTips', 'LinguisticsHumor', 'LongDistance', 'MachineLearning', 'Manufacturing', 'MathHelp', 'Meditation', 'NetworkingJobs', 'Neuropsychology', 'NoStupidQuestions',
                  'ObjectiveC', 'PCMasterRace', 'PLC', 'PhilosophyofScience', 'PhsychologicalTricks', 'PoliticalDiscussion', 'Polyamory', 'PrintedCircuitBoard', 'Progether',
                  'ProgrammerHumor', 'Proofreading', 'Python', 'RapeCounseling', 'RetailManagement', 'STEMdents', 'SWORDS', 'SWResources', 'SampleSize', 'SanctionedSuicide', 'Seduction',
                  'SiblingSupport', 'Statistics', 'SuicideWatch', 'Swift', 'SysadminJobs', 'TechNews', 'ThermalPerformance', 'Tinder', 'TinyCode', 'TowerOfBabel', 'TrueAskReddit',
                  'TrueReddit', 'Unix', 'VentureBiotech', 'WeMetOnline', 'Web_Development', 'WhatsTheWord', 'YoungJobs', 'academicpsychology', 'academicpublishing', 'accounting', 'advice',
                  'androiddev', 'translator', 'answers', 'asklinguistics', 'askmath', 'askphotography', 'askreddit', 'askscience', 'assistance', 'astronomy', 'audiology', 'autism', 'badcode',
                  'badlinguistics', 'beermoney', 'behavioralmedicine', 'behaviortherapy', 'bestof', 'bestofTLDR', 'bioengineering', 'biology', 'biotech', 'bodybuilding', 'bookquotes',
                  'books', 'breadboard', 'bugs', 'buildapc', 'business', 'careerguidance', 'cfd', 'changemyview', 'chemicalengineering', 'chipdesign', 'civilengineering', 'cloudcomputing',
                  'coding', 'coffeescript', 'cogneuro', 'cogneurocogsci', 'cognitivelinguistics', 'cogsci', 'compilers', 'complexsystems', 'compling', 'compression', 'compsci',
                  'computerforensics', 'computers', 'computerscience', 'conlangs', 'conspiracy', 'construction', 'cosmology', 'coursearea', 'cpp', 'cpp_questions', 'crypto', 'cryptography',
                  'cs50', 'csbooks', 'cscareerquestions', 'csharp', 'css', 'dae', 'dailyprogrammer', 'dailyscripts', 'darkinternet', 'dataisbeautiful', 'datamining', 'dementia', 'depression',
                  'diy', 'documentaries', 'dotnet', 'downsyndrome', 'dyslexia', 'economics', 'education', 'eebooks', 'electricalengineering', 'electronics', 'engineering',
                  'engineeringtechnology', 'entrepreneur', 'epidemiology', 'etymology', 'eurodiversity', 'everythingscience', 'evolution', 'evopsych', 'explainlikeimfive', 'favors',
                  'finance', 'financialindependence', 'findareddit', 'forhire', 'forth', 'freelance', 'freelanceUK', 'freelanceWriters', 'funny', 'gadgets', 'genetics', 'getdisciplined',
                  'getemployed', 'getmotivated', 'getting_over_it', 'goldredditsays', 'grammar', 'grammarwriting', 'graphic_design', 'hacking', 'hardware', 'history', 'holdmybeer',
                  'homeworkhelp', 'html', 'htmlbasics', 'humanism', 'hwstartups', 'hypotheticalsituation', 'iWantToLearn', 'ideasfortheadmins', 'illegaltorrents', 'improvevocab', 'india',
                  'ineedafavor', 'intel', 'intelligence', 'interview', 'inventions', 'iwantoutjobs', 'java', 'javaTIL', 'javacodegeeks', 'javahelp', 'javascript', 'jobbit', 'jobsearchhacks',
                  'jokes', 'jquery', 'languagetechnology', 'learnjava', 'learnjavascript', 'learnmath', 'learnprogramming', 'learnpython', 'lectures', 'lifehacks', 'linguistics', 'linux',
                  'linux4noobs', 'linuxquestions', 'literature', 'logic', 'machinelearning', 'marketing', 'masculism', 'math', 'mathbooks', 'mathematics', 'mathpsych', 'matlab',
                  'mechanicalengineering', 'medicine', 'meditation', 'mentalhealth', 'mentors', 'metalworking', 'microsoft', 'mmfb', 'motivation', 'movies', 'music', 'mysql', 'needadvice',
                  'networking', 'neuro', 'neurodiversity', 'neurophilosophy', 'neuropsychology', 'newproducts', 'news', 'newtoreddit', 'nonprofit_jobs', 'nootropics', 'obvious',
                  'occupationaltherapy', 'ocd', 'offmychest', 'opengl', 'osdev', 'parkrangers', 'perl', 'philosophy', 'philosophyofScience', 'philosophyofscience', 'php', 'physics', 'pics',
                  'politics', 'privacy', 'product_design', 'productivity', 'programbattles', 'programming', 'programmingbuddies', 'programmingchallenges', 'psychiatry', 'psychology',
                  'psychopharmacology', 'psychotherapy', 'psychscience', 'puzzles', 'python', 'quotes', 'rage', 'rational', 'reasonstolive', 'rehabtherapy', 'relationship_advice',
                  'relationships', 'resumes', 'riddles', 'robotics', 'ruby', 'saneorpsycho', 'schizophrenia', 'science', 'scientificresearch', 'self', 'selfhelp', 'selfimprovement', 'sex',
                  'shittyaskscience', 'shittyideas', 'shittyprogramming', 'showerthoughts', 'simpleliving', 'slp', 'socialism', 'socialmedia', 'socialskills', 'sociology', 'software',
                  'softwarearchitecture', 'softwaredevelopment', 'softwaregore', 'solotravel', 'space', 'specialed', 'startups', 'stopselfharm', 'suicidology', 'sysadmin', 'systems',
                  'talesfromtechsupport', 'technology', 'techsupport', 'teenagers', 'testimonials', 'themixednuts', 'thisismyjob', 'tipofmytongue', 'todayilearned', 'tr',
                  'translationstudies', 'travel', 'tutor', 'ultralight', 'undelete', 'undeleteShadow', 'undergraduateresearch', 'uniqueminds', 'visualbasic', 'web_programming', 'webdev',
                  'whatisthis', 'whatstheword', 'windows', 'windowsazure', 'womenEngineers', 'words', 'work', 'workonline', 'worldnews', 'writingprompts']  # major list, once a week
    # Short "basic" list (long-interval run) — this is what actually runs.
    b = ['AskAnthropology', 'AskScienceDiscussion', 'AskSocialScience',
         'Ask_Politics', 'badcode', 'biology', 'compilers',
         'computers', 'computerscience', 'crypto',
         'cryptography', 'cs50', 'csbooks',
         'cscareerquestions', 'dailyprogrammer', 'electronics', 'history',
         'linux', 'linux4noobs', 'linuxquestions', 'logic', 'psychology']  # basic_LONG INTERVAL
    # Deduplicate and sort (replaces the original manual set/loop dance).
    subs(sorted(set(b)))


if __name__ == '__main__':
    main()