#!/usr/bin/python -tt
import selenium
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
#importing time libraries to add wait times
from datetime import datetime
from time import sleep
#importing beautiful soup to read the page html source code
from bs4 import BeautifulSoup
#to create csv file where we'll scrape the content
import pandas as pd
#we'll also add the options functionality to disable notifications
chrome_options = Options()
#disable notifications
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-infobars")
#Writing three empty list
time_list = [] # Timestamp of the post
likes_list = [] # No of Likes of the post
name_list = [] # Name of the post
driver = webdriver.Chrome(ChromeDriverManager().install(),options=chrome_options)
driver.get("https://facebook.com")
driver.maximize_window()
sleep(2)
#Accept cookies
#cookies = WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.XPATH,'')))
email = driver.find_element_by_id("email")
email.send_keys("XXXXXXX@gmail.com") #Email-id
password = driver.find_element_by_id("pass")
password.send_keys("XXXXXXX") # Password
sleep(2)
login=driver.find_element_by_name("login")
login.click()
sleep(2)
driver.get("https://www.facebook.com/millerreporting")
sleep(4)
while True:
soup = BeautifulSoup(driver.page_source,"html.parser")
all_posts = soup.find_all("div",{"class":"x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z"})
for post in all_posts:
try:
name=post.find("a",{"class":"x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f"})
except:
name = "Post not found"
print(name)
try:
time=post.find("a",{"class":"x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g xt0b8zv xo1l8bm"})
except:
time="Time not found"
print(time)
try:
likes=post.find("span",{"class":"xrbpyxo x6ikm8r x10wlt62 xlyipyv x1exxlbk"})
except:
likes = "Likes not found"
print(likes)
name_list.append(name)
time_list.append(time)
likes_list.append(likes)
df=pd.DataFrame({"name":name_list,"time":time_list,"likes":likes_list})
#df.drop_duplicates(subset="content",keep="first",inplace = True)
df.to_csv("facebook_data1.csv")
if df.shape[0]>10:
break
if df.shape[0]>10:
break
sleep(5)
y = 500
for timer in range(0,20):
driver.execute_script("window.scrollTo(0," + str(y) + ")")
y +=500
sleep(3)
#print("You are inside the fb group & ready to close the browsr")
#driver.quit()
Aim: I want to scrape timestamp of all the posts & likes etc.
I’m scraping ( Miller Reporting Group )
First Problem is that it is not stopping at last post of the FB page.
Second in the csv file it is not extracting data,instead giving HTML.
CSV file : (facebook_data1.csv - Google Drive)
Help Needed.