import requests, re, os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from unidecode import unidecode

# Subreddit whose submission titles are fetched; sent as the `subreddit`
# query parameter of the Pushshift search request.
subreddit = 'Romania'
# Directory the image is written to: the working directory at start-up.
output_dir = os.getcwd()
# Base name of the output image; '.png' is appended when the figure is saved.
output_filename = 'RomaniaWordCloud'


# --- Collect unique submission titles ----------------------------------------
# The [start_date, end_date) range is walked in windows of `days_to_increment`
# days; each window is one search request. Titles are ASCII-folded with
# unidecode and de-duplicated across the whole run.
titles = []  # unique titles, in the order they were first seen
start_date = datetime(2021, 1, 1)
end_date = datetime(2022, 1, 1)
days_to_increment = 5  # width of each query window, in days

api_url = 'https://api.pushshift.io/reddit/submission/search'
seen_titles = set()  # O(1) duplicate check; `titles` keeps the ordering
window = timedelta(days=days_to_increment)

after_date = start_date
while after_date < end_date:
    # Clamp every window (including the first) so it never passes end_date.
    before_date = min(after_date + window, end_date)
    after = after_date.strftime('%Y-%m-%d')
    before = before_date.strftime('%Y-%m-%d')

    response = requests.get(
        api_url,
        # Let requests build and encode the query string.
        params={
            'sort': 'desc',
            'limit': 10000,
            'subreddit': subreddit,
            'after': after,
            'before': before,
        },
        # Without a timeout requests can block forever on a stalled server.
        timeout=30,
    )
    # Fail with a clear HTTP error instead of a JSON/KeyError further down.
    response.raise_for_status()

    for item in response.json()['data']:
        title = unidecode(item['title'])
        if title not in seen_titles:
            seen_titles.add(title)
            titles.append(title)
    print('Got titles between {} and {}. Reached {} titles'.format(
        after, before, len(titles)))

    # The next window starts where this one ended.
    after_date = before_date


print('Got a total of {} titles'.format(len(titles)))


# Flatten every title into one long string of space-separated words.
# For each title: collapse all whitespace runs to single spaces, then drop
# every character that is not a word character, whitespace or a hyphen.
# Each cleaned title is followed by one space so titles do not run together.
punctuation = re.compile(r'[^\w\s\-]')
words = ''.join(
    punctuation.sub('', ' '.join(title.split())) + ' '
    for title in titles
)

# Words kept out of the cloud: Romanian conjunctions, prepositions, pronouns
# and other filler, plus a few anomalies ('F', 'new', 'home', ...).
# Entries are written as authored (with diacritics where they have them);
# the lookup set built below adds the forms that can actually match.
_stop_words = (
    'fost', 'cine', 'au', 'ca', 'că', 'căci', 'când', 'cât', 'cum', 'dacă',
    'dar', 'darămite', 'de', 'decât', 'deci', 'deoarece', 'deși', 'fără',
    'fie', 'fiindcă', 'iar', 'isi', 'încât', 'însă', 'întrucât', 'necum',
    'nici', 'numai', 'or', 'ori', 'până', 'uri', 'F', 'lui', 'precum', 'sau',
    'să', 'și', 'totuși', 'unde', 'pe', 'la', 'spre', 'cu', 'sub', 'în',
    'prin', 'pentru', 'către', 'contra', 'lângă', 'ce', 'si', 'sa', 'ma',
    'fi', 'in', 'din', 'care', 'se', 'va', 'ati', 'imi', 'asa', 'mi', 'un',
    'nu', 'da', 'mai', 'cat', 'o', 'e', 'ne', 'al', 'cel', 'vs', 'își', 'pt',
    'new', 'home', 'ai', 'te', 'le', 'ul', 'n', 'm', 'l', 'il', 'ii',
)

# Romanian diacritics mapped to their plain ASCII letters.
_fold_diacritics = str.maketrans({
    '\u0103': 'a',  # a with breve
    '\u00e2': 'a',  # a with circumflex
    '\u00ee': 'i',  # i with circumflex
    '\u0219': 's',  # s with comma below
    '\u015f': 's',  # s with cedilla (legacy encoding of the same letter)
    '\u021b': 't',  # t with comma below
    '\u0163': 't',  # t with cedilla (legacy encoding of the same letter)
})

# Titles are ASCII-folded with unidecode when fetched and words are
# lower-cased before the membership test, so an entry only ever matches in
# its lower-case, diacritic-free form. Store that form (and the lower-cased
# original) in a set: duplicates vanish and lookups are O(1).
exclusions = frozenset(
    form
    for word in _stop_words
    for form in (word.lower(), word.lower().translate(_fold_diacritics))
)

# Remove excluded words (case-insensitively) and rebuild the text that is
# handed to the word cloud.
# split() with no argument skips the empty tokens that split(' ') produces
# for doubled or trailing spaces.
word_list = words.split()
# Lower-case the entries once so upper-case ones (e.g. 'F') can match the
# lower-cased words, and use a set so each lookup is O(1).
excluded = {entry.lower() for entry in exclusions}
curated_words = ' '.join(
    word for word in word_list if word.lower() not in excluded
)

print('Plotting...')

# Render the cloud on a 2048x1024 canvas and save it as a PNG in output_dir.
word_cloud = WordCloud(width=2048, height=1024).generate(curated_words)
figure = plt.figure(figsize=(20, 10))
plt.imshow(word_cloud, interpolation="bilinear")
plt.axis("off")
# os.path.join uses the platform separator and does not double it when
# output_dir already ends with one.
figure.savefig(os.path.join(output_dir, output_filename + '.png'), dpi=200)
# Release the figure once it has been written to disk.
plt.close(figure)
print('Done!')