xquery version "1.0-ml";

module namespace twitproc="http://www.marklogic.com/ns/nwalsh/twitter/process";

(: twitproc provides functions for processing status messages.
 :
 : These functions are mostly responsible for turning element(status)
 : messages into element(t:status) messages and inserting them in the
 : database. Along the way we add a bit of metadata, some of which will
 : be used later.
 :
 : @author Norman Walsh, norman.walsh@marklogic.com
 : @date 28 Aug 2009
 :)

import module namespace twit="http://www.marklogic.com/ns/nwalsh/twitter"
       at "/modules/twitter.xqy";

declare default function namespace "http://www.w3.org/2005/xpath-functions";

declare namespace t="http://www.marklogic.com/ns/nwalsh/twitter/tweets";
declare namespace xh="xdmp:http";

declare option xdmp:mapping "false"; 

(: Error codes raised by this module; both QNames use the t: prefix.
 : $ENOTARRAY is raised by twitproc:update-statuses and $EUNPARSEDATE
 : by twitproc:parse-date.
 :)
declare variable $ENOTARRAY as xs:QName := xs:QName("t:ENOTARRAY");
declare variable $EUNPARSEDATE as xs:QName := xs:QName("t:EUNPARSEDATE");

(: Lookup table from three-letter English month abbreviations to
 : two-digit month numbers. The element name is the abbreviation and the
 : element content is the number; twitproc:parse-date selects an entry
 : by comparing local-name() with the month token of a date.
 :)
declare variable $MONTHS
  := (<Jan>01</Jan>,<Feb>02</Feb>,<Mar>03</Mar>,<Apr>04</Apr>,
      <May>05</May>,<Jun>06</Jun>,<Jul>07</Jul>,<Aug>08</Aug>,
      <Sep>09</Sep>,<Oct>10</Oct>,<Nov>11</Nov>,<Dec>12</Dec>);

(: twitproc:update-statuses stores every status message found in a
 : statuses element, followed by the distinct users who wrote them.
 :
 : Each status is handed to twitproc:update-status. Afterwards, for each
 : distinct user id in the batch, the first user element carrying that
 : id is handed to twitproc:update-user, so a user who appears in several
 : statuses is stored once.
 :
 : By default, a reply chain is only followed back to its source for our
 : own messages. If $mustfollow is true() then replies are followed even
 : when the status message isn't one of our own. Following all replies
 : for everyone in a friends timeline is likely to be difficult unless
 : you have a small number of friends.
 :
 : @param $account identifies the MBB user.
 : @param $statuses contains the status messages as returned from
 :        the Twitter API calls.
 : @param $mustfollow indicates whether or not we should follow
 :        conversations back through a chain of replies.
 : @return Zero or more html:li elements for the inserted statuses.
 : @throws $ENOTARRAY if the statuses element has a @type other than "array".
 :)
declare function twitproc:update-statuses($account as element(t:account),
                                          $statuses as element(statuses),
                                          $mustfollow as xs:boolean)
{
  if ($statuses/@type != 'array')
  then
    error($ENOTARRAY, "Statuses type is not array, giving up.")
  else
    let $messages := $statuses/status
    let $authors := $messages/user
    return
      (
        (: An empty batch simply produces nothing from either loop. :)
        for $message in $messages
        return
          twitproc:update-status($account, $message, $mustfollow),

        for $author-id in distinct-values($authors/id)
        return
          twitproc:update-user($account, $authors[id = $author-id][1])
      )
};

(: twitproc:update-user stores a user document in the database.
 :
 : The Twitter API includes a user in every status message. We could
 : store statuses that way, but it seems like overkill. So users are
 : carved out and stored separately.
 :
 : Every child of the incoming user is copied into the t: namespace as
 : a simple string-valued element, except created_at, which is converted
 : with twitproc:parse-date. A t:service and a t:login element taken from
 : the account are appended. The document is written to the URI given by
 : twitproc:user-uri, in the users collection.
 :
 : @param $account identifies the MBB user.
 : @param $user The user as returned in a Twitter API status message.
 : @return ()
 :)
declare function twitproc:update-user($account as element(t:account),
                                      $user as element(user))
{
  let $fields :=
    for $field in $user/*
    return
      typeswitch ($field)
        case element(created_at)
          return
            <t:created_at>{twitproc:parse-date($field)}</t:created_at>
        default
          return
            element { xs:QName(concat("t:", local-name($field))) }
                    { string($field) }
  let $doc :=
    <t:user>
      { $fields }
      <t:service>{string($account/t:service)}</t:service>
      <t:login>{string($account/t:screen_name)}</t:login>
    </t:user>
  return
    xdmp:document-insert(
      twitproc:user-uri($account, xs:decimal($user/id)),
      $doc,
      (),
      "http://www.marklogic.com/collections/users")
};

(: twitproc:update-status stores a single status message in the database.
 :
 : The status is translated into the t: namespace child by child:
 :   - created_at is converted with twitproc:parse-date;
 :   - the three in_reply_to_* elements are dropped when they are empty;
 :   - user is reduced to a t:userid holding the user's id;
 :   - source is parsed as markup, with XHTML as the default namespace,
 :     and the parsed nodes are stored inside t:source;
 :   - everything else is copied as a string-valued element.
 :
 : We then add t:service and t:login so that we can track who downloaded
 : this status, and t:screen_name with the author's screen name, for
 : convenience later.
 :
 : If the status is in reply to another one and either $mustfollow is
 : true() or the author is the account holder, a t:reply-clean marker is
 : added to indicate that we should come back later and follow its
 : in-reply-to. Following all replies for everyone in a friends timeline
 : is likely to be difficult unless you have a small number of friends.
 :
 : If the text contains "@", "#", "!" (the last is used on identi.ca) or
 : the string "http:", a t:text-clean marker is added to indicate that we
 : should come back later and clean up the text.
 :
 : The document is written to the URI given by twitproc:status-uri, in a
 : per-service collection and a per-service, per-login collection.
 :
 : @param $account identifies the MBB user.
 : @param $status contains the status message as returned from
 :        the Twitter API calls.
 : @param $mustfollow indicates whether or not we should follow
 :        conversations back through a chain of replies.
 : @return An html:li for the inserted status.
 :)
declare function twitproc:update-status($account as element(t:account),
                                        $status as element(status),
                                        $mustfollow as xs:boolean)
{
  let $service := string($account/t:service)
  let $login := string($account/t:screen_name)

  (: These two are computed out here on purpose: inside the html:li
   : below the default element namespace is XHTML, so the unprefixed
   : paths would no longer select the Twitter elements. :)
  let $author := string($status/user/screen_name)
  let $body := string($status/text)

  let $uri := twitproc:status-uri($account, xs:decimal($status/id))
  let $svccoll := concat("http://www.marklogic.com/collections/tweets/",
                         $service)
  let $usrcoll := concat($svccoll, "/", $login)

  let $follow-reply :=
    $status/in_reply_to_status_id != ''
    and ($mustfollow or $status/user/screen_name = $account/t:screen_name)

  let $needs-cleanup :=
    some $marker in ("@", "#", "!", "http:")
    satisfies contains($body, $marker)

  let $children :=
    for $child in $status/*
    (: The plain translation: same local name in t:, string content. :)
    let $renamed :=
      element { xs:QName(concat("t:", local-name($child))) }
              { string($child) }
    (: Reply fields are only worth keeping when they hold a value. :)
    let $nonblank := if ($child = '') then () else $renamed
    return
      typeswitch ($child)
        case element(created_at)
          return
            <t:created_at>{twitproc:parse-date($child)}</t:created_at>
        case element(in_reply_to_status_id)
          return $nonblank
        case element(in_reply_to_user_id)
          return $nonblank
        case element(in_reply_to_screen_name)
          return $nonblank
        case element(user)
          return
            <t:userid>{string($child/id)}</t:userid>
        case element(source)
          return
            <t:source>{
              xdmp:unquote($child,
                           "http://www.w3.org/1999/xhtml",
                           ("repair-full","format-xml"))/node()
            }</t:source>
        default
          return $renamed

  let $doc :=
    <t:status>
      { $children }
      <t:service>{$service}</t:service>
      <t:login>{$login}</t:login>
      <t:screen_name>{$author}</t:screen_name>
      { if ($follow-reply)
        then <t:reply-clean>false</t:reply-clean>
        else () }
      { if ($needs-cleanup)
        then <t:text-clean>false</t:text-clean>
        else () }
    </t:status>

  return
    (
      <li xmlns="http://www.w3.org/1999/xhtml">
        { $author }: { $body }
      </li>,
      xdmp:document-insert($uri, $doc, (), ($svccoll, $usrcoll))
    )
};

(: ============================================================ :)
(: twitproc:user-uri builds the database URI of a user.
 :
 : The URI has the form /tweets/SERVICE/users/USERID, where SERVICE is
 : the account's t:service value. It is the URI under which the user
 : is, or will be, stored.
 :
 : @param $account identifies the MBB user.
 : @param $userid identifies the user.
 : @return A database uri for the user.
 :)
declare function twitproc:user-uri($account as element(t:account),
                                   $userid as xs:decimal)
        as xs:string
{
  (: The leading path segment carries its own slash; the rest are joined. :)
  string-join(("/tweets",
               string($account/t:service),
               "users",
               string($userid)),
              "/")
};

(: twitproc:status-uri builds the database URI of a status message.
 :
 : The URI has the form /tweets/SERVICE/statuses/STATUSID, where SERVICE
 : is the account's t:service value. It is the URI under which the status
 : message is, or will be, stored.
 :
 : @param $account identifies the MBB user.
 : @param $statusid identifies the message.
 : @return A database uri for the status message.
 :)
declare function twitproc:status-uri($account as element(t:account),
                                     $statusid as xs:decimal)
        as xs:string
{
  (: The leading path segment carries its own slash; the rest are joined. :)
  string-join(("/tweets",
               string($account/t:service),
               "statuses",
               string($statusid)),
              "/")
};

(: twitproc:parse-date converts a Twitter API date into an xs:dateTime.
 :
 : The Twitter API returns dates in created_at elements using a
 : human-readable notation, for example "Fri Aug 28 21:47:50 +0000 2009".
 : Not much use to us. This function turns them into xs:dateTime values
 : in UTC.
 :
 : Leading, trailing and repeated whitespace is tolerated. Any other
 : deviation from the expected shape is reported as $EUNPARSEDATE
 : instead of surfacing as a cast error: extra text around the date, an
 : unknown month name, a time zone other than +0000, or a day or time
 : that does not exist.
 :
 : @param $date the date in Twitter human-readable form.
 : @return The xs:dateTime represented by that $date.
 : @throws $EUNPARSEDATE if we can't parse the $date.
 :)
declare function twitproc:parse-date($date as xs:string) as xs:dateTime {
  (: Collapse whitespace first so that the token positions are stable. :)
  let $clean := normalize-space($date)
  let $toks := tokenize($clean, " ")
  let $mon := $MONTHS[local-name() = $toks[2]]

  (: The ISO form of the date, or empty if the input has the wrong shape.
   : The pattern is anchored so that stray text cannot shift the tokens,
   : and the month must be one we know about. :)
  let $iso :=
    if (matches($clean, "^\S\S\S \S\S\S \d+ \d+:\d+:\d+ \+0000 \d+$")
        and exists($mon))
    then
      let $dayd := xs:decimal($toks[3])
      let $day := if ($dayd < 10) then concat("0", $dayd) else $dayd
      let $year := xs:decimal($toks[6])
      return
        concat($year, "-", $mon, "-", $day, "T", $toks[4], "Z")
    else
      ()

  return
    (: castable also rejects well-shaped nonsense such as Feb 31. :)
    if ($iso castable as xs:dateTime)
    then
      xs:dateTime($iso)
    else
      error($EUNPARSEDATE, concat("Unparseable date: ", $date))
};
