from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

linkstosearch = [
    'https://www.euronews.com/tag/solar-energy',
    'https://www.pv-magazine.com/category/opinion/',
    'https://www.theguardian.com/environment/solarpower'
]
sndround = []      # tag/pagination pages queued for a second crawl round
keyword = "solar"
reslinks = []      # final list of collected article links


def findElsOnPage(link):
    ldomain = link[:16]  # first characters of the parent URL; used to keep only same-site links and avoid social media
    linklist = []        # links found on this page that belong to the parent site
    driver.get(link)
    driver.implicitly_wait(0.5)  # allow up to 0.5 s for elements to appear during lookups
    articles = driver.find_elements(By.TAG_NAME, "a")
    for links in articles:
        try:
            link_href = links.get_attribute("href")
            if ((link_href is not None) and (ldomain in link_href) and (keyword in link_href)
                    and ("#" not in link_href) and (link_href not in linklist)
                    and (link_href not in reslinks)):
                # print(link_href)
                if ("?" in link_href) and (("page" in link_href) or ("p=" in link_href)):
                    if "7" in link_href:  # stop once pagination reaches page 7
                        break
                    if link_href not in sndround:
                        print(link_href)
                        sndround.append(link_href)
                elif "tag" in link_href:
                    continue
                else:
                    linklist.append(link_href)
        except Exception:  # e.g. stale element references; skip and keep going
            continue
    return linklist


for link in linkstosearch:
    reslinks.extend(findElsOnPage(link))
for link in sndround:
    reslinks.extend(findElsOnPage(link))
driver.quit()

print("Mission complete! \nCheck souplinks.txt")
with open('souplinks.txt', 'w') as file:
    # Write each element to a new line in the file
    for link in reslinks:
        file.write(link + '\n')
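
# --- Optional refinement (a sketch, not used by the script above) -------------
# The prefix check `ldomain in link_href` (first 16 characters of the parent URL)
# is a rough way to keep links on the same site. A stricter alternative would be
# to compare hostnames with urllib.parse; the helper below (same_site is a
# hypothetical name) is a possible drop-in for that check and is not called here.
from urllib.parse import urlparse

def same_site(parent_url, candidate_url):
    # True when both URLs share the same host, e.g. "www.euronews.com"
    return urlparse(candidate_url).netloc == urlparse(parent_url).netloc

# Inside findElsOnPage, `ldomain in link_href` could then be replaced with
# `same_site(link, link_href)`.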