Post daily jobs on Slack

Tags: #remotive #jobs #slack #gsheet #naas_drivers #automation #opendata #text
Author: Sanjeet Attili

Input

Import libraries

import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import time
from naas_drivers import gsheet, slack
import naas

Setup Slack channel configuration

SLACK_TOKEN = "xoxb-1481042297777-3085654341191-xxxxxxxxxxxxxxxxxxxxxxxxx"
SLACK_CHANNEL = "05_work"
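Before scheduling anything, you can check that the token and channel are valid by posting a throwaway message; this is a minimal sketch reusing the same slack.connect(...).send(...) call as the Output section below.

# Optional sanity check (assumes the token and channel above are valid)
slack.connect(SLACK_TOKEN).send(SLACK_CHANNEL, "Connection test from the jobs notebook ✅")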

Setup sheet log data

spreadsheet_id = "1EBefhkbmqaXMZLRCiafabf6xxxxxxxxxxxxxxxxxxx"
sheet_name = "SLACK_CHANNEL_POSTS"

Setup Remotive

Get categories from Remotive

def get_remotejob_categories():
    req_url = "https://remotive.io/api/remote-jobs/categories"
    res = requests.get(req_url)
    try:
        res.raise_for_status()
    except requests.HTTPError as e:
        return e
    res_json = res.json()
    # Get the list of categories from the response
    jobs = res_json.get('jobs')
    return pd.DataFrame(jobs)

df_categories = get_remotejob_categories()
df_categories
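The valid category values are the entries of the "slug" column referenced in the parameters below; a quick way to list them before choosing (a sketch, assuming the API response includes that column as the comment below indicates):

# List the category slugs you can pick from, e.g. 'data'
df_categories["slug"].tolist()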
Enter your parameters

categories = ['data']  # Pick categories from the "slug" column above
date_from = -10  # Date difference in days from now => must be negative
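To make the date_from convention concrete, here is a small worked sketch of the conversion performed in the Model section: the negative day offset becomes seconds and is added to the current epoch time to produce the cutoff timestamp.

# Worked example of the cutoff computation used below:
# -10 days -> -864000 seconds -> timestamp 10 days before now
cutoff = time.time() + (date_from * 24 * 60 * 60)
print(datetime.fromtimestamp(cutoff))  # jobs published after this datetime are kept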

Set the Scheduler

naas.scheduler.add(recurrence="0 9 * * *")  # cron expression: every day at 9:00 AM
# naas.scheduler.delete()  # Uncomment this line to delete your scheduler if needed

Model

Get the sheet log of jobs

df_jobs_log = gsheet.connect(spreadsheet_id).get(sheet_name=sheet_name)
df_jobs_log
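On the very first run the log sheet may have no rows yet. A minimal guard (an assumption about how an empty sheet comes back, not part of the original flow) keeps the deduplication step below from failing on a missing URL column:

# Assumption: on a first run the sheet may come back empty or None.
# Fall back to an empty log with the expected columns so deduplication still works.
if df_jobs_log is None or len(df_jobs_log) == 0:
    df_jobs_log = pd.DataFrame(columns=["URL", "TITLE", "COMPANY", "PUBLICATION_DATE"])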

Get all jobs posted after timestamp_date

All jobs posted after 'date_from' will be fetched: the day offset is converted to seconds ('search_jobs_from') and added to the current epoch time to build the cutoff timestamp. The fetch starts with a small page limit and recursively widens it until the oldest job returned is older than the cutoff, so no job in the window is missed.
REMOTIVE_DATETIME = "%Y-%m-%dT%H:%M:%S"
NAAS_DATETIME = "%Y-%m-%d %H:%M:%S"

def get_remotive_jobs_since(jobs, date):
    # Keep only the jobs published after the cutoff timestamp
    ret = []
    for job in jobs:
        publication_date = datetime.strptime(job['publication_date'], REMOTIVE_DATETIME).timestamp()
        if publication_date > date:
            ret.append({
                'URL': job['url'],
                'TITLE': job['title'],
                'COMPANY': job['company_name'],
                'PUBLICATION_DATE': datetime.fromtimestamp(publication_date).strftime(NAAS_DATETIME)
            })
    return ret

def get_category_jobs_since(category, date, limit):
    # Fetch a page of jobs; if the oldest one is still newer than the cutoff,
    # retry recursively with a larger limit until the whole window is covered
    url = f"https://remotive.io/api/remote-jobs?category={category}&limit={limit}"
    res = requests.get(url)
    if res.json()['jobs']:
        publication_date = datetime.strptime(res.json()['jobs'][-1]['publication_date'], REMOTIVE_DATETIME).timestamp()
        if len(res.json()['jobs']) < limit or date > publication_date:
            print(f"Jobs from category {category} fetched ✅")
            return get_remotive_jobs_since(res.json()['jobs'], date)
        else:
            return get_category_jobs_since(category, date, limit + 5)
    return []

def get_jobs_since(categories: list, date_from: int):
    if date_from >= 0:
        return "'date_from' must be negative. Please update your parameter."
    # Convert the day offset to seconds and build the cutoff timestamp
    search_jobs_from = date_from * 24 * 60 * 60  # days in seconds
    timestamp_date = time.time() + search_jobs_from
    jobs = []
    for category in categories:
        jobs += get_category_jobs_since(category, timestamp_date, 5)
    print(f'- All jobs since {datetime.fromtimestamp(timestamp_date)} have been fetched -')
    return pd.DataFrame(jobs)

df_jobs = get_jobs_since(categories, date_from=date_from)
df_jobs

Remove duplicate jobs

def remove_duplicates(df1, df2):
    # Get the URLs already logged in the sheet
    jobs_log = df1.URL.unique()
    # Exclude jobs already logged from the newly fetched jobs
    df2 = df2[~df2.URL.isin(jobs_log)]
    # Oldest first, so Slack messages are posted in chronological order
    return df2.sort_values(by="PUBLICATION_DATE")

df_new_jobs = remove_duplicates(df_jobs_log, df_jobs)
df_new_jobs

Output

Add new jobs to the sheet log

gsheet.connect(spreadsheet_id).send(sheet_name=sheet_name,
                                    data=df_new_jobs,
                                    append=True)

Send new jobs to the Slack channel

if len(df_new_jobs) > 0:
    for _, row in df_new_jobs.iterrows():
        url = row.URL
        slack.connect(SLACK_TOKEN).send(SLACK_CHANNEL, f"<{url}>")
else:
    print("Nothing to publish in Slack!")
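Wrapping the URL in angle brackets is Slack's link syntax, which makes the message render as a clickable link. If you want a more informative post, a hedged variation could reuse the TITLE and COMPANY columns built in the Model section (a sketch, not part of the original flow):

# Sketch: richer Slack message using the columns collected above
for _, row in df_new_jobs.iterrows():
    message = f"*{row.TITLE}* at {row.COMPANY}\n<{row.URL}>"
    slack.connect(SLACK_TOKEN).send(SLACK_CHANNEL, message)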