Post daily jobs on slack
Tags: #remoteok #jobs #slack #gsheet #naas_drivers #automation
Author: Sanjeet Attili

Input

Import libraries

1
import pandas as pd
2
import requests
3
from datetime import datetime
4
import time
5
from naas_drivers import gsheet, slack
6
import naas
Copied!

Setup slack channel configuration

1
# Slack bot token ("xoxb-…") and the channel that receives the job posts.
# NOTE(review): keep real tokens out of source control — prefer naas secrets.
SLACK_TOKEN = "xoxb-1481042297777-3085654341191-xxxxxxxxxxxxxxxxxxxxxxxxx"
SLACK_CHANNEL = "05_jobs"
Copied!

Setup sheet log data

For the driver to fetch the contents of your Google Sheet, you first need to share the sheet with the service account linked with Naas (the exact service-account e-mail address is shown in your Naas workspace; it is redacted here).
1
# Google Sheet acting as the persistent log of jobs already posted to Slack.
spreadsheet_id = "1EBefhkbmqaXMZLRCiafabf6xxxxxxxxxxxxxxxxxxx"
sheet_name = "REMOTEOK_POSTS"
Copied!

Setup Remoteok

Setting the parameters

1
# RemoteOK tags to query — one API request is made per tag.
categories = [
    'machine learning',
    'data science',
    'nlp',
    'deep learning',
    'computer vision',
    'data',
    'natural language processing',
    'data engineer',
]

# Look-back window in days, relative to now; must be negative
# (-10 keeps jobs published during the last 10 days).
date_from = -10
Copied!

Set the Scheduler

1
# Schedule this notebook to run every day at 09:00 (cron syntax).
naas.scheduler.add(recurrence="0 9 * * *")
# naas.scheduler.delete()  # uncomment to remove the scheduler if needed
Copied!

Model

Get the sheet log of jobs

1
# Load the log of already-posted jobs from the Google Sheet. A brand-new
# (empty) sheet makes the driver raise KeyError — fall back to an empty
# DataFrame so the rest of the pipeline still runs.
try:
    df_jobs_log = gsheet.connect(spreadsheet_id).get(sheet_name=sheet_name)
except KeyError as e:
    print('Gsheet is empty!!')
    df_jobs_log = pd.DataFrame()
Copied!

Get jobs from RemoteOk

1
REMOTEOK_API = "https://remoteok.com/api"
REMOTEOK_DATETIME = "%Y-%m-%dT%H:%M:%S"
NAAS_DATETIME = "%Y-%m-%d %H:%M:%S"

def get_jobs(remoteok_url, categories, date_from=-10):
    """Fetch recent job postings from the RemoteOK API.

    Parameters
    ----------
    remoteok_url : str
        Base URL of the RemoteOK API, e.g. "https://remoteok.com/api".
    categories : iterable of str
        Tags to query; one HTTP request is made per tag.
    date_from : int, optional
        Look-back window in days, expressed as a negative number.
        Default -10 keeps jobs published during the last 10 days.
        (Previously read from a module-level global; now a parameter.)

    Returns
    -------
    pandas.DataFrame
        Columns URL, TITLE, COMPANY, TAGS, LOCATION, PUBLICATION_DATE,
        de-duplicated on URL and sorted newest-first.

    Raises
    ------
    requests.HTTPError
        If any per-tag request fails. (The original returned the
        exception object instead of a DataFrame, which crashed callers.)
    """
    headers = {
        # RemoteOK rejects requests without a browser-like User-Agent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
    # Oldest acceptable publication time in epoch seconds
    # (loop-invariant, hoisted out of the per-tag loop).
    required_time = time.time() + date_from * 24 * 60 * 60

    rows = []
    for tag in categories:
        res = requests.get(remoteok_url + f"?tag={tag}", headers=headers)
        # Bug fix: fail loudly instead of `return e` on HTTP errors.
        res.raise_for_status()

        job_details = res.json()
        # The API's first element is a legal notice, not a job; a payload
        # of length 1 therefore means "no jobs for this tag".
        if len(job_details) == 1:
            continue
        for job in job_details[1:]:
            date = job['date'].split('+')[0]  # strip the timezone offset
            publication_time = datetime.strptime(date, REMOTEOK_DATETIME).timestamp()
            if publication_time >= required_time:
                rows.append({
                    'URL': job.get('url'),
                    'TITLE': job.get('position'),
                    'COMPANY': job.get('company'),
                    'TAGS': ", ".join(job.get('tags')),
                    'LOCATION': job.get('location'),
                    'PUBLICATION_DATE': datetime.fromtimestamp(publication_time).strftime(NAAS_DATETIME),
                })

    # Build the frame in one shot (the per-cell df.loc appends were O(n^2)).
    # Explicit columns keep drop_duplicates/sort_values valid when no job
    # matched the window (the original raised KeyError on an empty frame).
    df = pd.DataFrame(rows, columns=['URL', 'TITLE', 'COMPANY', 'TAGS',
                                     'LOCATION', 'PUBLICATION_DATE'])
    df = df.drop_duplicates(subset='URL', keep='first')
    return df.sort_values(by='PUBLICATION_DATE', ascending=False)
42
43
# Fetch the latest jobs for every configured tag and preview the result.
df_jobs = get_jobs(REMOTEOK_API, categories)
df_jobs.head()
Copied!
Sample preview of the fetched jobs (rendered table below; stray CSS from the notebook export removed):
Text
URL
TITLE
COMPANY
TAGS
LOCATION
PUBLICATION_DATE
1
https://remoteOK.com/remote-jobs/109456-remote...
Principal Software Engineer Data Scientist
Cardlytics
data science, golang, engineer, dev, digital n...
2022-03-09 01:15:04
2
https://remoteOK.com/remote-jobs/109342-remote...
Senior Backend Engineer
Narcissa
javascript, crypto, node, data science, senior...
Worldwide
2022-03-02 17:45:49
3
https://remoteOK.com/remote-jobs/109291-remote...
Data Science Lead Marketplace
Hipcamp
data science, marketing, engineer, exec
San Francisco, CA
2022-02-28 11:00:07
4
https://remoteOK.com/remote-jobs/109261-remote...
Data Analyst
Kikoff
mobile, data science, marketing, engineer, bac...
Remote, United States
2022-02-27 18:00:07
0
https://remoteOK.com/remote-jobs/109238-remote...
Machine Learning Engineer
Generally Intelligent
machine learning, engineer
Remote-only
2022-02-27 07:00:01

Remove duplicate jobs

1
def remove_duplicates(df1, df2):
    """Return the rows of ``df2`` whose URL is not already logged in ``df1``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Log of already-posted jobs. May be completely empty (the notebook
        creates ``pd.DataFrame()`` when the Google Sheet has no data).
    df2 : pandas.DataFrame
        Freshly fetched jobs with URL and PUBLICATION_DATE columns.

    Returns
    -------
    pandas.DataFrame
        The new jobs only, sorted by PUBLICATION_DATE ascending.
    """
    # Bug fix: an empty log frame has no URL column, so `df1.URL` raised
    # AttributeError. Treat a missing column as "nothing posted yet".
    jobs_log = df1["URL"].unique() if "URL" in df1.columns else []

    # Exclude jobs already logged.
    df2 = df2[~df2["URL"].isin(jobs_log)]
    return df2.sort_values(by="PUBLICATION_DATE")
8
9
# Keep only jobs that are not yet in the sheet log — these get posted.
df_new_jobs = remove_duplicates(df_jobs_log, df_jobs)
df_new_jobs
Copied!
Preview of df_new_jobs (rendered table; stray CSS from the notebook export removed):

Output

Add new jobs on the sheet log

1
# Append the newly found jobs to the Google Sheet log so the next
# scheduled run does not post them again.
gsheet.connect(spreadsheet_id).send(
    sheet_name=sheet_name,
    data=df_new_jobs,
    append=True,
)
Copied!
1
{}
Copied!
1
# Post each new job URL to the Slack channel; skip when nothing is new.
if len(df_new_jobs) > 0:
    for _, row in df_new_jobs.iterrows():
        # Bug fix: the column is named 'URL' (uppercase, as written by
        # get_jobs); `row.url` raised AttributeError and no message was
        # ever sent.
        url = row.URL
        # Wrapping in <> makes Slack render the raw link without unfurl text.
        slack.connect(SLACK_TOKEN).send(SLACK_CHANNEL, f"<{url}>")
else:
    print("Nothing to be published in Slack !")
Copied!
1
Nothing to be published in Slack !
Copied!
Copy link
Edit on GitHub