Created
July 2, 2019 04:54
-
-
Save mostafaasadi/5e29fa74441ddb78e095961ccec4376a to your computer and use it in GitHub Desktop.
قطعه کدی برای بررسی و آنالیز تکرار واژگان در ضرب‌المثل‌های فارسی
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import urllib.request | |
| import arabic_reshaper | |
| from bs4 import BeautifulSoup | |
| from collections import Counter | |
| from bidi.algorithm import get_display | |
| from persian_wordcloud.wordcloud import PersianWordCloud | |
def get():
    """Scrape paragraph text from every URL in ``links`` and append it to ``db``.

    Reads the file ``links`` (one URL per line), downloads each page, and for
    every ``<p>`` element appends one line of extracted text to the file
    ``db``. Each URL and each extracted line is also printed to stdout.

    Both files are resolved relative to the current working directory.
    Errors from opening the files or from the network are not caught.
    """
    # Strip line endings and drop blank lines so that an empty line in the
    # links file is never handed to urlopen (which rejects an empty URL).
    with open('links') as links_file:
        links = [line.strip() for line in links_file if line.strip()]

    # Open the output once for the whole run instead of once per paragraph.
    with open('db', 'a') as db:
        for link in links:
            print(link)
            # The context manager closes the HTTP response once it is parsed.
            with urllib.request.urlopen(link) as html:
                soup = BeautifulSoup(html, 'lxml')
            for paragraph in soup.find_all('p'):
                # Keep what lies between the second '>' of the serialized
                # element and the next '<'. For markup shaped like
                # <p><tag>text</tag></p> that is the inner text; for a plain
                # <p>text</p> it is an empty string.
                t = str(paragraph).split('>')[2].split('<')[0]
                # Remove the fixed phrase that appears in the scraped text.
                t = t.replace('ضرب المثل هاي ايراني', '')
                db.write(t + '\n')
                print(t)
def convert(text):
    """Return *text* reshaped by ``arabic_reshaper`` and reordered by ``get_display``.

    The string is first passed to ``arabic_reshaper.reshape`` and the result
    is then passed to ``get_display``; the output of the latter is returned.
    Presumably this prepares right-to-left text for rendering -- confirm
    against the two libraries' documentation.
    """
    return get_display(arabic_reshaper.reshape(text))
def wc():
    """Build a word cloud from the words in ``db`` and save it as ``result.png``.

    Stop words are read from ``stopwords.txt`` (whitespace-separated). Every
    whitespace-separated token of ``db`` has the Persian comma removed, is
    dropped if it is empty or a stop word, and is otherwise passed through
    ``convert``. The surviving words are joined with newlines and handed to
    ``PersianWordCloud``. The resulting image is shown and then written to
    ``result.png``.

    All files are resolved relative to the current working directory.
    """
    with open('stopwords.txt', 'r') as f:
        stopwords = {word for line in f for word in line.split()}

    words = []
    with open('db', 'r') as f:
        for line in f:
            for token in line.split():
                # Strip the comma *before* the stop-word test so that a stop
                # word directly followed by a comma is still filtered out.
                word = token.replace('،', '')
                # A token that was only a comma is now empty; skip it.
                if word and word not in stopwords:
                    words.append(str(convert(word)))

    text = '\n'.join(words)
    wordcloud = PersianWordCloud(
        only_persian=True,
        max_words=150,
        margin=5,
        width=800,
        height=800,
        min_font_size=1,
        colormap='Accent',
        max_font_size=500,
        background_color="white"
    ).generate(text)
    image = wordcloud.to_image()
    image.show()
    image.save('result.png')
def count():
    """Print the frequency of every non-stop-word in ``db``, rarest first.

    Stop words are read from ``stopwords.txt`` and the text from ``db``; both
    are split on whitespace. Words are printed one per line as
    ``<word> :: <count>`` in ascending order of count. Words with equal
    counts keep the order of their first appearance in ``db``, because the
    sort is stable.

    Both files are resolved relative to the current working directory.
    """
    with open('stopwords.txt', 'r') as f:
        # A set gives constant-time membership tests in the loop below.
        stopwords = set(f.read().split())

    with open('db', 'r') as f:
        frequencies = Counter(f.read().split())

    for word, occurrences in sorted(frequencies.items(), key=lambda item: item[1]):
        if word not in stopwords:
            print(word + ' :: ' + str(occurrences))
def main():
    """Run the pipeline: scrape with ``get``, report with ``count``, draw with ``wc``."""
    # The steps run strictly in this order; count() and wc() both read the
    # ``db`` file that get() appends to.
    for step in (get, count, wc):
        step()
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment