;;; For doing a protein lookup, BLAST, and Clustal-W alignment.

;; Our biologist is clearly looking for very high scores on the
;; Clustal-W, so that will be our success criterion.

(define-framework bio

    ((a (biologist ?sequence-id) ?b) ==
     (protein-lookup ?sequence-id) => (a protein-lookup ?pd) then
     (protein-sequence ?sequence) <=  (a protein-lookup ?pd) then
     (blast-search ?sequence) => (a blast ?blast-service) then
     (blast-search ?sequences) <= (a blast ?blast-service) then
     (clustalw ?sequences) => (a clustal ?clustalw-service) then
     (clustalw ?aligned-sequences) <= (a clustal ?clustalw-service))

  ((a protein-lookup ?pd) ==
   (protein-lookup ?sequence-id) <= (a ?role ?id) then
   (protein-sequence ?sequence) => (a ?role ?id) <-- (protein-lookup ?pd ?sequence-id ?sequence))

  ((a blast ?blast-service) ==
   (blast-search ?sequence) <= (a ?role ?id) then
   (blast-search ?sequences) => (a blast ?blast-service) <-- (blast ?blast-service ?sequence ?sequences))

  ((a clustal ?clustalw-service) ==
   (clustalw ?sequences) <= (a ?role ?id) then
   (clustalw ?aligned-sequences) => (a ?role ?id) <-- (clustal ?clustalw-service ?sequences ?aligned-sequences)))

;;;

(<-- (agent-roles ?agent ?roles)
     (agent ?agent ?roles))

(<-- (agent-role-pair ?id ?role)
     (agent ?id ?roles)
     (member ?role ?roles))

(<-- (framework-agents bio ?agents)
     (hiveagents bio ?agents))


;;; This is stuff for declaring and accessing bioinformatics services
;;; and agents.

;; We will have three services: lookup, blast, and clustalw.  I don't
;; yet know how the databases will fit in.

(define-wsdl-service 'ebi-protein-database "http://www.ebi.ac.uk/ws/services/urn:Dbfetch?wsdl")
(define-wsdl-service 'ebi-blast "http://www.ebi.ac.uk/ws/WSWUBlast.wsdl")
(define-wsdl-service 'incogen-blast "http://jabba.incogen.com:8888/soapBLAST.wsdl")
(define-wsdl-service 'nig-getentry "http://xml.nig.ac.jp/wsdl/GetEntry.wsdl")
(define-wsdl-service 'nig-blast "http://xml.nig.ac.jp/wsdl/Blast.wsdl")
(define-wsdl-service 'nig-clustalw "http://xml.nig.ac.jp/wsdl/ClustalW.wsdl")

;; A protocol is a notional description of how a service ought to be
;; called, and what it should return.  We capture these as generics at
;; the moment.

;;   "Get sequence relating to SEQUENCE-ID.  Returns list holding
;;   two strings, the first is the first line of the result, and the
;;   second is the protein sequence itself."
(defconstraint protein-lookup (agent sequence-id))

(defconstraint blast (agent arg))

;;"SEQS is a list of (at present) two sequences, with the topline data."
(defconstraint clustal (agent seqs))

;;; NIG agent

(defclass <nig> (<agent>)
  ())

(setf (symbol-function 'memoized-inkove-service)
      (memo #'invoke-service 'memoized-inkove-service #'identity 'equal))

(defmethod protein-lookup ((agent <nig>) seqid)
  (let ((res (memoized-inkove-service 'nig-getentry :|getFASTA_SWISSEntry| :|accession| seqid)))
    (nig-protein-lookup-extract-result (extract-soap-result res))))

(defmethod blast ((agent <nig>) arg)
  (let ((res (memoized-inkove-service 'nig-blast :|searchSimple|
			     :|program| "blastp"
			     :|database| "SWISS"
			     :|query| (format nil "~A~%~A" (first arg) (second arg)))))
    (extract-soap-result res)))

(defmethod clustal ((agent <nig>) sequences)
  (let* ((res (memoized-inkove-service 'nig-clustalw :|analyzeSimple|
			      :|query| (apply #'concatenate 'string sequences))))
    (extract-soap-result res)))

(defun blast-extract-result (chunk)
  "List of sequence accession numbers (eg \"P90978\") from CHUNK."
  (let* ((chunk-scanner (cl-ppcre:create-scanner "^[A-z][A-z]\\|[A-Z]*[0-9]+\\|.*" :multi-line-mode t))
	 (line-scanner (cl-ppcre:create-scanner "(sp\\|[A-Z][0-9]+)|(gi\\|[0-9]+)" :single-line-mode t))
	 (lines (cl-ppcre:all-matches-as-strings chunk-scanner chunk))
	 (codes (mapcar #'(lambda (line)
			    (cl-ppcre:scan-to-strings line-scanner line))
			    lines)))
    codes))

(defun nig-protein-lookup-extract-result (str0)
  ;; Built for the NIG lookup service.
  (let* ((first-newline (position #\newline str0))
	 (str1 (subseq str0 (1+ first-newline) (length str0))))
    (list (subseq str0 0 first-newline)
	  (apply #'concatenate 'string (cl-ppcre:split "\\s+" str1)))))

;;; EBI agent

(defclass <ebi> (<agent>)
  ())

(defmethod protein-lookup ((agent <ebi>) seqid)
  (let ((res (memoized-inkove-service 'ebi-protein-database :|fetchData|
			     :|query| (format nil "uniprot:~A" seqid)
			     :|format| "default" :|style| "raw")))
    ;; XXX We need to add a dummy first line.  This ought (for our
    ;; purposes) to contain the protein ID, but EBI doesn't do it.
    (list "" (ebi-protein-lookup-extract-result res))))


(defun ebi-protein-lookup-extract-result (answer)
  (let* ((strings (mapcar #'third (cddr (third (third (third answer))))))
	 (first-line (member-if #'(lambda (str)
				    (string= str "SQ" :start1 0 :end1 2))
				strings))
	 (components (apply #'concatenate 'string (butlast (cdr first-line)))))
    (remove #\space components)))


;;; Incogen agent

(defclass <incogen> (<agent>)
  ())

(defmethod blast ((agent <incogen>) seq)
  (let ((res (memoized-inkove-service 'incogen-blast :|runRPCBLAST|
			     :|databaseName| "month.aa"
			     :|programName| "blastp"
			     :|fastaData| (format nil "~A~%~A" (first seq) (second seq)))))
    (extract-soap-result res)))

;;; Bio web

;; It is complicated by the multiplicity of databases.

;; Figure out how to rank the quality of matches on clustal-w... Use
;; different databases to get the similarities...  Where's the
;; variation come in, though?  And it looks like clustal-w is a
;; standard program: only nig seems to offer it via WSDL.

(define-pill-function protein-lookup/3 (?+agent ?+protein-id ??protein-sequence cont)
  (let* ((agent (pill::repackage (deref-exp ?+agent) :cl-user))
	 (protein-id (deref-exp ?+protein-id))
	 (result (protein-lookup (gethash agent *agents*) protein-id)))
    (if (pill::unify! ??protein-sequence result)
	(funcall cont))))

(define-pill-function blast/3 (?+agent ?+sequence ??sequences cont)
  (format t "!!! ~S ~S~%" "hi" "there")
  (let* ((agent (pill::repackage (deref-exp ?+agent) :cl-user))
	 (sequence (deref-exp ?+sequence))
	 (result (blast (gethash agent *agents*) sequence)))
    (if (pill::unify! ??sequences result)
	(funcall cont))))

(define-pill-function clustal/3 (?+agent ?+sequence ??sequences cont)
  (let* ((agent (pill::repackage (deref-exp ?+agent) :cl-user))
	 (sequence (deref-exp ?+sequence))
	 (result (blast (gethash agent *agents*) sequence)))
    (if (pill::unify! ??sequences result)
	(funcall cont))))

(defun create-agent-signatures ()
  "Returns a list of agents and roles that can be fed to the simulator."
  ;; Things like (a blast <ebi>)...
  (mapappend #'(lambda (agent)
	      (mapcar #'(lambda (constraint)
			  `(a ,constraint ,agent))
		      (agent-constraints agent)))
	  (available-agents)))

(define-pill-function create-agent-signatures/1 (??signatures cont)
  (pill::unify! ??signatures (pill::repackage (create-agent-signatures) :pill))
  (funcall cont))

;; This is just for the bio protocol.  Use the CLOS environment to
;; generate the information.
(define-pill-function bio-agents-for-role/2 (?+role ??agents cont)
  (let* ((constraint (pill::repackage (deref-exp ?+role) :cl-user))
	 (agents (remove-if #'(lambda (agent)
				(not (member constraint (agent-constraints agent))))
			    (available-agents))))
    (pill::unify! (pill::repackage agents :pill) ??agents)
    (funcall cont)))

(<-- (agents-for-role bio ?role ?agents)
     (nonvar ?role)
     (bio-agents-for-role ?role ?agents))

(<-- (bio-agents ?agents)
     (create-agent-signatures ?remotes)
     (= ?locals ((a (biologist "P49643") bill)))
     (append ?locals ?remotes ?agents))
