J'essaie d'écrire une tâche cron pour exécuter périodiquement un script Python que j'ai écrit et qui ajoute des données à une base de données que je construis. Le script fonctionne lorsque je l'exécute avec python /Users/me/Desktop/pythonScript/script.py
depuis le terminal, mais la tâche cron ne fonctionne pas. J'ai exécuté chmod a+x /Users/me/Desktop/pythonScript/script.py
pour rendre le script exécutable. Le script python commence également par #!/usr/bin/python
.
J'ai ajouté la valeur de $PATH
comme variable PATH
dans mon crontab
, comme indiqué ici, et j'ai également ajouté les variables SHELL
et HOME
.
crontab -l
renvoie actuellement ceci:
PATH="/Library/Frameworks/Python.framework/Versions/3.6/bin:/Users/cole/anaconda/bin:/Library/Frameworks/Python.framework/Versions/3.5/bin:/Library/Frameworks/Python.framework/Versions/3.5/bin:/Library/Frameworks/Python.framework/Versions/3.5/bin:/Library/Frameworks/Python.framework/Versions/3.5/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/opt/X11/bin:/Library/TeX/texbin"
SHELL="/bin/bash"
HOME = "/Users/me/Desktop/pythonScript/"
* * * * * python script.py
* * * * * env > /tmp/cronenv
La première tâche exécute mon script script.py
tandis que la seconde écrit l'environnement de cron
dans le fichier /tmp/cronenv
. Ce fichier ressemble à ceci :
SHELL=/bin/bash
USER=me
PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin:/Users/cole/anaconda/bin:/Library/Frameworks/Python.framework/Versions/3.5/bin:/Library/Frameworks/Python.framework/Versions/3.5/bin:/Library/Frameworks/Python.framework/Versions/3.5/bin:/Library/Frameworks/Python.framework/Versions/3.5/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/opt/X11/bin:/Library/TeX/texbin
PWD=/Users/cole
SHLVL=1
HOME=/Users/cole
LOGNAME=cole
_=/usr/bin/env
Cependant, ma base de données ne se met pas à jour et lorsque je recherche cron
dans mon fichier system.log, je trouve les messages d'erreur suivants:
Nov 5 20:24:00 Coles-MacBook-Air-2 cron[3301]: no path for address 0x11a77b000
Nov 5 20:24:00 Coles-MacBook-Air-2 cron[3302]: no path for address 0x11a77b000
Nov 5 20:25:00 Coles-MacBook-Air-2 cron[3314]: no path for address 0x11a77b000
Nov 5 20:25:00 Coles-MacBook-Air-2 cron[3315]: no path for address 0x11a77b000
Notez qu'il y en a deux par minute, une pour chaque tâche cron,
bien que la seconde semble fonctionner alors que la première ne fonctionne pas. Avez-vous des suggestions ?
Comme cela peut être pertinent, voici le script:
script.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import pickle
import re
import time

import requests
from nltk import word_tokenize
saveDir = '/Users/me/Desktop/pythonScript/dbfolder' # folder for the raw downloaded text files; NOTE: no trailing slash, so a path separator is needed when building file paths from it
workingDir = '/Users/me/Desktop/pythonScript/' # folder containing this script; the pickled 'gutenburgDic' progress file is read from and written to it
# Base URL of the Project Gutenberg mirror and the extension of the files
# that are downloaded from it.
home = 'http://mirror.csclub.uwaterloo.ca/gutenberg/'
fileType = '.txt'


def urlMaker(x):
    """Return the mirror URL of the plain-text file for ebook number ``x``.

    The directory path is made of every digit of the number except the
    last one, followed by the number itself, e.g. ``12345`` becomes
    ``1/2/3/4/12345/12345.txt``.  Single-digit numbers live under ``0/``.

    Args:
        x: The ebook number, as a string of digits or as an int.

    Returns:
        The full URL as a string.

    Raises:
        ValueError: If ``x`` is not a valid integer.
    """
    x = str(x)
    # Only single-digit numbers go under "0/"; the original test
    # ``int(x) > 10`` wrongly sent ebook 10 to "0/10/" instead of "1/10/".
    if int(x) < 10:
        return home + '0/' + x + '/' + x + fileType
    digit_dirs = ''.join(digit + '/' for digit in x[:-1])
    return home + digit_dirs + x + '/' + x + fileType
def _extract_body(text):
    """Return the part of ``text`` found between known header/footer markers.

    The marker pairs are tried in a fixed order and the first pair whose
    two markers are both present is used.  If no pair is present, or the
    opening marker does not precede the closing one, ``text`` is returned
    unchanged.
    """
    small_print = re.search(r'\Send\Sthe small print!', text)
    declaration = re.search('the project gutenberg etext of the declaration of independence', text)
    start_marker = re.search('start of (.*) project gutenberg (.*)*', text)
    end_marker = re.search('end of (.*) project gutenberg (.*)*', text)
    au_licence = re.search('http://gutenberg.net.au/licence.html', text)
    au_footer = re.search('this site is full of free ebooks', text)

    marker_pairs = (
        (start_marker, end_marker),
        (au_licence, au_footer),
        (declaration, end_marker),
        (small_print, end_marker),
    )
    for opening, closing in marker_pairs:
        if opening and closing:
            pattern = '{}(.*){}'.format(re.escape(opening.group(0)),
                                        re.escape(closing.group(0)))
            span = re.search(pattern, text, re.S)
            # The closing marker may appear before the opening one, in
            # which case there is nothing to cut out: keep the whole text.
            return span.group(1) if span else text
    return text


def _long_paragraphs(text, separator, newline):
    """Return the paragraphs of ``text`` that are longer than 100 tokens.

    Paragraphs are obtained by splitting on ``separator``; those that are
    empty or mention 'gutenberg', 'ebook' or 'etext' are dropped, and
    every ``newline`` inside a kept paragraph is replaced by a space.
    """
    return [
        paragraph.replace(newline, ' ')
        for paragraph in text.split(separator)
        if paragraph
        and 'gutenberg' not in paragraph
        and 'ebook' not in paragraph
        and 'etext' not in paragraph
        and len(word_tokenize(paragraph)) > 100
    ]


def process(url):
    """Download ``url`` and return its cleaned paragraphs and its raw text.

    The download is attempted up to three times, waiting 300 and then 600
    seconds after a failed attempt.

    Args:
        url: Address of the text file to download.

    Returns:
        A tuple ``(paragraphs, raw_text)`` where ``paragraphs`` is a list
        of lower-cased paragraphs longer than 100 tokens and ``raw_text``
        is the unmodified response text.

    Raises:
        requests.exceptions.ConnectionError: If the third attempt fails.
    """
    # requests raises its own ConnectionError, which is not a subclass of
    # the built-in one, so the requests class must be named explicitly or
    # the retries never run.
    for delay in (300, 600):
        try:
            r = requests.get(url)
            break
        except requests.exceptions.ConnectionError:
            time.sleep(delay)
    else:
        # Last attempt: let the error propagate instead of returning the
        # exception class, which callers would then try to index.
        r = requests.get(url)

    toprint = r.text
    text = _extract_body(r.text.lower())

    # Prefer Unix-style paragraph breaks; fall back to Windows-style ones
    # only when the text contains no Unix-style break at all.
    lst = []
    if '\n\n' in text:
        lst = _long_paragraphs(text, '\n\n', '\n')
    elif '\r\n\r\n' in text:
        lst = _long_paragraphs(text, '\r\n\r\n', '\r\n')
    return (lst, toprint)
# Build an index dictionary mapping each ebook number to its title.
indexUrl = 'http://mirror.csclub.uwaterloo.ca/gutenberg/GUTINDEX.ALL'
r = requests.get(indexUrl)
index = r.text.lower()

# Keep only the listing located between the October 2017 posting-dates
# header and the end-of-index marker.
start = re.escape(re.search(
    '~ ~ ~ ~ posting dates for the below ebooks: 1 oct 2017 to 31 oct 2017 ~ ~ ~ ~',
    index).group(0))
end = re.escape(re.search('<==end of gutindex.all==>', index).group(0))
index = re.search('{}(.*){}'.format(start, end), index, re.S).group(1)

# Split the listing into entries on PC (CRLF) blank lines and keep the
# first line of each entry, which drops the subtitle lines.
lbPC = re.split('\r\n\r\n', index)
cleanSubsPC = [entry.split('\r\n')[0] for entry in lbPC]

# Split again the entries that use MAC (LF) blank lines.
lbMAC = []
for entry in cleanSubsPC:
    lbMAC.extend(re.split('\n\n', entry))

# Keep the first line of each entry that uses MAC line breaks.
cleanSubsMAC = [entry.split('\n')[0] for entry in lbMAC]

# Collect the lines that end with a digit (the ebook number), skipping
# section headers, and strip the stray characters left by mis-decoding.
textPairs = []
for line in cleanSubsMAC:
    if len(line) <= 1:
        continue
    if line.startswith('~ ~ ~ ~ posting') or line.startswith('title and author'):
        continue
    try:
        int(line[-1])
    except ValueError:
        continue
    textPairs.append(
        line.replace('â', '').replace('â\xa0', ' ').replace('\xa0', ' '))

# Map the trailing number of each line to the text before its first space.
inDic = {
    int(re.match('.*?([0-9]+)$', pair).group(1)): pair.split(' ')[0].replace(',', ' ')
    for pair in textPairs
}
# Map every ebook number from 1 to 55862 to the URL of its text file.
urls = {number: urlMaker(str(number)) for number in range(1, 55863)}
# Load the dictionary of previously collected data so the script resumes
# where the last run stopped.  On the very first run the file does not
# exist yet, so start from an empty dictionary instead of leaving `data`
# undefined.
try:
    with open(workingDir+'gutenburgDic', 'rb') as handle:
        data = pickle.load(handle)
except FileNotFoundError:
    data = {}

# Make sure the output folder exists before writing into it.
os.makedirs(saveDir, exist_ok=True)

# Process the next 100 texts: store their long paragraphs in `data` and
# save each raw text file for later use.
for i in range(len(data)+1, len(data)+101):
    if i not in urls:
        # Every known ebook number has been processed.
        break
    # Download each text once; the original called process() twice per URL.
    paragraphs, text = process(urls[i])
    data[i] = (urls[i], paragraphs)
    # saveDir has no trailing slash, so join it properly, and write as
    # UTF-8 explicitly because cron runs with a minimal locale.
    target = os.path.join(saveDir, urls[i].replace('/', '.'))
    with open(target, 'w', encoding='utf-8') as f:
        f.write(text)

# Save the updated dictionary of >100 word paragraphs.
with open(workingDir+'gutenburgDic', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
HOME
variable pour éviter la nécessité d'un long chemin absolu vers le script? Re: le chemin de python, je suis désolé, je suis un noob, mais voulez-vous dire python installé dans mon dossier d’applications, par exemple /Macintosh HD/Applications/Python 3.6
?
/usr/local/bin/python3.6
?